1/*
2 *******************************************************************************
3 * Copyright (C) 2009-2015, Google, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *******************************************************************************
6 */
7package com.ibm.icu.impl;
8
9import java.io.BufferedReader;
10import java.io.FileInputStream;
11import java.io.IOException;
12import java.io.InputStream;
13import java.io.InputStreamReader;
14import java.io.UnsupportedEncodingException;
15import java.text.ParsePosition;
16import java.util.Arrays;
17import java.util.Comparator;
18import java.util.LinkedHashSet;
19import java.util.List;
20import java.util.Map;
21import java.util.Map.Entry;
22import java.util.Set;
23import java.util.TreeMap;
24import java.util.regex.Pattern;
25
26import com.ibm.icu.text.StringTransform;
27import com.ibm.icu.text.SymbolTable;
28import com.ibm.icu.text.UnicodeSet;
29import com.ibm.icu.util.Freezable;
30
31/**
32 * Contains utilities to supplement the JDK Regex, since it doesn't handle
33 * Unicode well.
34 *
35 * @author markdavis
36 */
37public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
38    // Note: we don't currently have any state, but intend to in the future,
39    // particularly for the regex style supported.
40
41    private SymbolTable symbolTable;
42
43    /**
44     * Set the symbol table for internal processing
45     * @internal
46     */
47    public SymbolTable getSymbolTable() {
48        return symbolTable;
49    }
50
51    /**
52     * Get the symbol table for internal processing
53     * @internal
54     */
55    public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
56        this.symbolTable = symbolTable;
57        return this;
58    }
59
60    /**
61     * Adds full Unicode property support, with the latest version of Unicode,
62     * to Java Regex, bringing it up to Level 1 (see
63     * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
64     * regex pattern string and interpreting the character classes (\p{...},
65     * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
66     * this utility, Java regex expressions can be updated to work with the
67     * latest version of Unicode, and with all Unicode properties. Note that the
68     * UnicodeSet syntax has not yet, however, been updated to be completely
69     * consistent with Java regex, so be careful of the differences.
70     * <p>Not thread-safe; create a separate copy for different threads.
71     * <p>In the future, we may extend this to support other regex packages.
72     *
73     * @regex A modified Java regex pattern, as in the input to
74     *        Pattern.compile(), except that all "character classes" are
75     *        processed as if they were UnicodeSet patterns. Example:
76     *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
77     * @return A processed Java regex pattern, suitable for input to
78     *         Pattern.compile().
79     */
80    public String transform(String regex) {
81        StringBuilder result = new StringBuilder();
82        UnicodeSet temp = new UnicodeSet();
83        ParsePosition pos = new ParsePosition(0);
84        int state = 0; // 1 = after \
85
86        // We add each character unmodified to the output, unless we have a
87        // UnicodeSet. Note that we don't worry about supplementary characters,
88        // since none of the syntax uses them.
89
90        for (int i = 0; i < regex.length(); ++i) {
91            // look for UnicodeSets, allowing for quoting with \ and \Q
92            char ch = regex.charAt(i);
93            switch (state) {
94            case 0: // we only care about \, and '['.
95                if (ch == '\\') {
96                    if (UnicodeSet.resemblesPattern(regex, i)) {
97                        // should only happen with \p
98                        i = processSet(regex, i, result, temp, pos);
99                        continue;
100                    }
101                    state = 1;
102                } else if (ch == '[') {
103                    // if we have what looks like a UnicodeSet
104                    if (UnicodeSet.resemblesPattern(regex, i)) {
105                        i = processSet(regex, i, result, temp, pos);
106                        continue;
107                    }
108                }
109                break;
110
111            case 1: // we are after a \
112                if (ch == 'Q') {
113                    state = 1;
114                } else {
115                    state = 0;
116                }
117                break;
118
119            case 2: // we are in a \Q...
120                if (ch == '\\') {
121                    state = 3;
122                }
123                break;
124
125            case 3: // we are in at \Q...\
126                if (ch == 'E') {
127                    state = 0;
128                }
129                state = 2;
130                break;
131            }
132            result.append(ch);
133        }
134        return result.toString();
135    }
136
137    /**
138     * Convenience static function, using standard parameters.
139     * @param regex as in process()
140     * @return processed regex pattern, as in process()
141     */
142    public static String fix(String regex) {
143        return STANDARD.transform(regex);
144    }
145
146    /**
147     * Compile a regex string, after processing by fix(...).
148     *
149     * @param regex Raw regex pattern, as in fix(...).
150     * @return Pattern
151     */
152    public static Pattern compile(String regex) {
153        return Pattern.compile(STANDARD.transform(regex));
154    }
155
156    /**
157     * Compile a regex string, after processing by fix(...).
158     *
159     * @param regex Raw regex pattern, as in fix(...).
160     * @return Pattern
161     */
162    public static Pattern compile(String regex, int options) {
163        return Pattern.compile(STANDARD.transform(regex), options);
164    }
165
166    /**
167     * Compile a composed string from a set of BNF lines; see the List version for more information.
168     *
169     * @param bnfLines Series of BNF lines.
170     * @return Pattern
171     */
172    public String compileBnf(String bnfLines) {
173        return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
174    }
175
176    /**
177     * Compile a composed string from a set of BNF lines, such as for composing a regex
178     * expression. The lines can be in any order, but there must not be any
179     * cycles. The result can be used as input for fix().
180     * <p>
181     * Example:
182     * <pre>
183     * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
184     * scheme = reserved+;
185     * host = // reserved+;
186     * query = [\\=reserved]+;
187     * fragment = reserved+;
188     * reserved = [[:ascii:][:alphabetic:]];
189     * </pre>
190     * <p>
191     * Caveats: at this point the parsing is simple; for example, # cannot be
192     * quoted (use \\u0023); you can set it to null to disable.
193     * The equality sign and a few others can be reset with
194     * setBnfX().
195     *
196     * @param lines Series of lines that represent a BNF expression. The lines contain
197     *          a series of statements that of the form x=y;. A statement can take
198     *          multiple lines, but there can't be multiple statements on a line.
199     *          A hash quotes to the end of the line.
200     * @return Pattern
201     */
202    public String compileBnf(List<String> lines) {
203        Map<String, String> variables = getVariables(lines);
204        Set<String> unused = new LinkedHashSet<String>(variables.keySet());
205        // brute force replacement; do twice to allow for different order
206        // later on can optimize
207        for (int i = 0; i < 2; ++i) {
208            for (Entry<String, String> entry : variables.entrySet()) {
209                String variable   = entry.getKey(),
210                       definition = entry.getValue();
211
212                for (Entry<String, String> entry2 : variables.entrySet()) {
213                    String variable2 = entry2.getKey(),
214                           definition2 = entry2.getValue();
215                    if (variable.equals(variable2)) {
216                        continue;
217                    }
218                    String altered2 = definition2.replace(variable, definition);
219                    if (!altered2.equals(definition2)) {
220                        unused.remove(variable);
221                        variables.put(variable2, altered2);
222                        if (log != null) {
223                            try {
224                                log.append(variable2 + "=" + altered2 + ";");
225                            } catch (IOException e) {
226                                throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
227                            }
228                        }
229                    }
230                }
231            }
232        }
233        if (unused.size() != 1) {
234            throw new IllegalArgumentException("Not a single root: " + unused);
235        }
236        return variables.get(unused.iterator().next());
237    }
238
239    public String getBnfCommentString() {
240        return bnfCommentString;
241    }
242
243    public void setBnfCommentString(String bnfCommentString) {
244        this.bnfCommentString = bnfCommentString;
245    }
246
247    public String getBnfVariableInfix() {
248        return bnfVariableInfix;
249    }
250
251    public void setBnfVariableInfix(String bnfVariableInfix) {
252        this.bnfVariableInfix = bnfVariableInfix;
253    }
254
255    public String getBnfLineSeparator() {
256        return bnfLineSeparator;
257    }
258
259    public void setBnfLineSeparator(String bnfLineSeparator) {
260        this.bnfLineSeparator = bnfLineSeparator;
261    }
262
263    /**
264     * Utility for loading lines from a file.
265     * @param result The result of the appended lines.
266     * @param file The file to have an input stream.
267     * @param encoding if null, then UTF-8
268     * @return filled list
269     * @throws IOException If there were problems opening the file for input stream.
270     */
271    public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
272        InputStream is = new FileInputStream(file);
273        try {
274            return appendLines(result, is, encoding);
275        } finally {
276            is.close();
277        }
278    }
279
280    /**
281     * Utility for loading lines from a UTF8 file.
282     * @param result The result of the appended lines.
283     * @param inputStream The input stream.
284     * @param encoding if null, then UTF-8
285     * @return filled list
286     * @throws IOException  If there were problems opening the input stream for reading.
287     */
288    public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
289            throws UnsupportedEncodingException, IOException {
290        BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
291        while (true) {
292            String line = in.readLine();
293            if (line == null) break;
294            result.add(line);
295        }
296        return result;
297    }
298
299
300
301    /* (non-Javadoc)
302     * @see com.ibm.icu.util.Freezable#cloneAsThawed()
303     */
304    public UnicodeRegex cloneAsThawed() {
305        // TODO Auto-generated method stub
306        try {
307            return (UnicodeRegex)clone();
308        } catch (CloneNotSupportedException e) {
309            throw new IllegalArgumentException(); // should never happen
310        }
311    }
312
313    /* (non-Javadoc)
314     * @see com.ibm.icu.util.Freezable#freeze()
315     */
316    public UnicodeRegex freeze() {
317        // no action needed now.
318        return this;
319    }
320
321    /* (non-Javadoc)
322     * @see com.ibm.icu.util.Freezable#isFrozen()
323     */
324    public boolean isFrozen() {
325        // at this point, always true
326        return true;
327    }
328
329    // ===== PRIVATES =====
330
331    private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
332        try {
333            pos.setIndex(i);
334            UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
335            x.complement().complement(); // hack to fix toPattern
336            result.append(x.toPattern(false));
337            i = pos.getIndex() - 1; // allow for the loop increment
338            return i;
339        } catch (Exception e) {
340            throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
341        }
342    }
343
344    private static UnicodeRegex STANDARD = new UnicodeRegex();
345    private String bnfCommentString = "#";
346    private String bnfVariableInfix = "=";
347    private String bnfLineSeparator = "\n";
348    private Appendable log = null;
349
350    private Comparator<Object> LongestFirst = new Comparator<Object>() {
351        public int compare(Object obj0, Object obj1) {
352            String arg0 = obj0.toString();
353            String arg1 = obj1.toString();
354            int len0 = arg0.length();
355            int len1 = arg1.length();
356            if (len0 != len1) return len1 - len0;
357            return arg0.compareTo(arg1);
358        }
359    };
360
361    private Map<String, String> getVariables(List<String> lines) {
362        Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
363        String variable = null;
364        StringBuffer definition = new StringBuffer();
365        int count = 0;
366        for (String line : lines) {
367            ++count;
368            // remove initial bom, comments
369            if (line.length() == 0) continue;
370            if (line.charAt(0) == '\uFEFF') line = line.substring(1);
371
372            if (bnfCommentString != null) {
373                int hashPos = line.indexOf(bnfCommentString);
374                if (hashPos >= 0) line = line.substring(0, hashPos);
375            }
376            String trimline = line.trim();
377            if (trimline.length() == 0) continue;
378
379            // String[] lineParts = line.split(";");
380            String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
381            if (linePart.trim().length() == 0) continue;
382            boolean terminated = trimline.endsWith(";");
383            if (terminated) {
384                linePart = linePart.substring(0,linePart.lastIndexOf(';'));
385            }
386            int equalsPos = linePart.indexOf(bnfVariableInfix);
387            if (equalsPos >= 0) {
388                if (variable != null) {
389                    throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
390                }
391                variable = linePart.substring(0,equalsPos).trim();
392                if (variables.containsKey(variable)) {
393                    throw new IllegalArgumentException("Duplicate variable definition in " + line);
394                }
395                definition.append(linePart.substring(equalsPos+1).trim());
396            } else { // no equals, so
397                if (variable == null) {
398                    throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
399                }
400                definition.append(bnfLineSeparator).append(linePart);
401            }
402            // we are terminated if i is not at the end, or the line ends with a ;
403            if (terminated) {
404                variables.put(variable, definition.toString());
405                variable = null; // signal we have no variable
406                definition.setLength(0);
407            }
408        }
409        if (variable != null) {
410            throw new IllegalArgumentException("Missing ';' at end");
411        }
412        return variables;
413    }
414}
415