1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2009-2015, Google, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 *******************************************************************************
8 */
9package com.ibm.icu.impl;
10
11import java.io.BufferedReader;
12import java.io.FileInputStream;
13import java.io.IOException;
14import java.io.InputStream;
15import java.io.InputStreamReader;
16import java.io.UnsupportedEncodingException;
17import java.text.ParsePosition;
18import java.util.Arrays;
19import java.util.Comparator;
20import java.util.LinkedHashSet;
21import java.util.List;
22import java.util.Map;
23import java.util.Map.Entry;
24import java.util.Set;
25import java.util.TreeMap;
26import java.util.regex.Pattern;
27
28import com.ibm.icu.text.StringTransform;
29import com.ibm.icu.text.SymbolTable;
30import com.ibm.icu.text.UnicodeSet;
31import com.ibm.icu.util.Freezable;
32
33/**
34 * Contains utilities to supplement the JDK Regex, since it doesn't handle
35 * Unicode well.
36 *
37 * <p>TODO: Move to com.ibm.icu.dev.somewhere.
38 * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
39 *
40 * @author markdavis
41 */
42public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
43    // Note: we don't currently have any state, but intend to in the future,
44    // particularly for the regex style supported.
45
46    private SymbolTable symbolTable;
47
48    /**
49     * Set the symbol table for internal processing
50     * @internal
51     */
52    public SymbolTable getSymbolTable() {
53        return symbolTable;
54    }
55
56    /**
57     * Get the symbol table for internal processing
58     * @internal
59     */
60    public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
61        this.symbolTable = symbolTable;
62        return this;
63    }
64
65    /**
66     * Adds full Unicode property support, with the latest version of Unicode,
67     * to Java Regex, bringing it up to Level 1 (see
68     * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
69     * regex pattern string and interpreting the character classes (\p{...},
70     * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
71     * this utility, Java regex expressions can be updated to work with the
72     * latest version of Unicode, and with all Unicode properties. Note that the
73     * UnicodeSet syntax has not yet, however, been updated to be completely
74     * consistent with Java regex, so be careful of the differences.
75     * <p>Not thread-safe; create a separate copy for different threads.
76     * <p>In the future, we may extend this to support other regex packages.
77     *
78     * @regex A modified Java regex pattern, as in the input to
79     *        Pattern.compile(), except that all "character classes" are
80     *        processed as if they were UnicodeSet patterns. Example:
81     *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
82     * @return A processed Java regex pattern, suitable for input to
83     *         Pattern.compile().
84     */
85    @Override
86    public String transform(String regex) {
87        StringBuilder result = new StringBuilder();
88        UnicodeSet temp = new UnicodeSet();
89        ParsePosition pos = new ParsePosition(0);
90        int state = 0; // 1 = after \
91
92        // We add each character unmodified to the output, unless we have a
93        // UnicodeSet. Note that we don't worry about supplementary characters,
94        // since none of the syntax uses them.
95
96        for (int i = 0; i < regex.length(); ++i) {
97            // look for UnicodeSets, allowing for quoting with \ and \Q
98            char ch = regex.charAt(i);
99            switch (state) {
100            case 0: // we only care about \, and '['.
101                if (ch == '\\') {
102                    if (UnicodeSet.resemblesPattern(regex, i)) {
103                        // should only happen with \p
104                        i = processSet(regex, i, result, temp, pos);
105                        continue;
106                    }
107                    state = 1;
108                } else if (ch == '[') {
109                    // if we have what looks like a UnicodeSet
110                    if (UnicodeSet.resemblesPattern(regex, i)) {
111                        i = processSet(regex, i, result, temp, pos);
112                        continue;
113                    }
114                }
115                break;
116
117            case 1: // we are after a \
118                if (ch == 'Q') {
119                    state = 1;
120                } else {
121                    state = 0;
122                }
123                break;
124
125            case 2: // we are in a \Q...
126                if (ch == '\\') {
127                    state = 3;
128                }
129                break;
130
131            case 3: // we are in at \Q...\
132                if (ch == 'E') {
133                    state = 0;
134                }
135                state = 2;
136                break;
137            }
138            result.append(ch);
139        }
140        return result.toString();
141    }
142
143    /**
144     * Convenience static function, using standard parameters.
145     * @param regex as in process()
146     * @return processed regex pattern, as in process()
147     */
148    public static String fix(String regex) {
149        return STANDARD.transform(regex);
150    }
151
152    /**
153     * Compile a regex string, after processing by fix(...).
154     *
155     * @param regex Raw regex pattern, as in fix(...).
156     * @return Pattern
157     */
158    public static Pattern compile(String regex) {
159        return Pattern.compile(STANDARD.transform(regex));
160    }
161
162    /**
163     * Compile a regex string, after processing by fix(...).
164     *
165     * @param regex Raw regex pattern, as in fix(...).
166     * @return Pattern
167     */
168    public static Pattern compile(String regex, int options) {
169        return Pattern.compile(STANDARD.transform(regex), options);
170    }
171
172    /**
173     * Compile a composed string from a set of BNF lines; see the List version for more information.
174     *
175     * @param bnfLines Series of BNF lines.
176     * @return Pattern
177     */
178    public String compileBnf(String bnfLines) {
179        return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
180    }
181
182    /**
183     * Compile a composed string from a set of BNF lines, such as for composing a regex
184     * expression. The lines can be in any order, but there must not be any
185     * cycles. The result can be used as input for fix().
186     * <p>
187     * Example:
188     * <pre>
189     * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
190     * scheme = reserved+;
191     * host = // reserved+;
192     * query = [\\=reserved]+;
193     * fragment = reserved+;
194     * reserved = [[:ascii:][:alphabetic:]];
195     * </pre>
196     * <p>
197     * Caveats: at this point the parsing is simple; for example, # cannot be
198     * quoted (use \\u0023); you can set it to null to disable.
199     * The equality sign and a few others can be reset with
200     * setBnfX().
201     *
202     * @param lines Series of lines that represent a BNF expression. The lines contain
203     *          a series of statements that of the form x=y;. A statement can take
204     *          multiple lines, but there can't be multiple statements on a line.
205     *          A hash quotes to the end of the line.
206     * @return Pattern
207     */
208    public String compileBnf(List<String> lines) {
209        Map<String, String> variables = getVariables(lines);
210        Set<String> unused = new LinkedHashSet<String>(variables.keySet());
211        // brute force replacement; do twice to allow for different order
212        // later on can optimize
213        for (int i = 0; i < 2; ++i) {
214            for (Entry<String, String> entry : variables.entrySet()) {
215                String variable   = entry.getKey(),
216                       definition = entry.getValue();
217
218                for (Entry<String, String> entry2 : variables.entrySet()) {
219                    String variable2 = entry2.getKey(),
220                           definition2 = entry2.getValue();
221                    if (variable.equals(variable2)) {
222                        continue;
223                    }
224                    String altered2 = definition2.replace(variable, definition);
225                    if (!altered2.equals(definition2)) {
226                        unused.remove(variable);
227                        variables.put(variable2, altered2);
228//                        if (log != null) {
229//                            try {
230//                                log.append(variable2 + "=" + altered2 + ";");
231//                            } catch (IOException e) {
232//                                throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
233//                            }
234//                        }
235                    }
236                }
237            }
238        }
239        if (unused.size() != 1) {
240            throw new IllegalArgumentException("Not a single root: " + unused);
241        }
242        return variables.get(unused.iterator().next());
243    }
244
245    public String getBnfCommentString() {
246        return bnfCommentString;
247    }
248
249    public void setBnfCommentString(String bnfCommentString) {
250        this.bnfCommentString = bnfCommentString;
251    }
252
253    public String getBnfVariableInfix() {
254        return bnfVariableInfix;
255    }
256
257    public void setBnfVariableInfix(String bnfVariableInfix) {
258        this.bnfVariableInfix = bnfVariableInfix;
259    }
260
261    public String getBnfLineSeparator() {
262        return bnfLineSeparator;
263    }
264
265    public void setBnfLineSeparator(String bnfLineSeparator) {
266        this.bnfLineSeparator = bnfLineSeparator;
267    }
268
269    /**
270     * Utility for loading lines from a file.
271     * @param result The result of the appended lines.
272     * @param file The file to have an input stream.
273     * @param encoding if null, then UTF-8
274     * @return filled list
275     * @throws IOException If there were problems opening the file for input stream.
276     */
277    public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
278        InputStream is = new FileInputStream(file);
279        try {
280            return appendLines(result, is, encoding);
281        } finally {
282            is.close();
283        }
284    }
285
286    /**
287     * Utility for loading lines from a UTF8 file.
288     * @param result The result of the appended lines.
289     * @param inputStream The input stream.
290     * @param encoding if null, then UTF-8
291     * @return filled list
292     * @throws IOException  If there were problems opening the input stream for reading.
293     */
294    public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
295            throws UnsupportedEncodingException, IOException {
296        BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
297        while (true) {
298            String line = in.readLine();
299            if (line == null) break;
300            result.add(line);
301        }
302        return result;
303    }
304
305
306
307    /* (non-Javadoc)
308     * @see com.ibm.icu.util.Freezable#cloneAsThawed()
309     */
310    @Override
311    public UnicodeRegex cloneAsThawed() {
312        // TODO Auto-generated method stub
313        try {
314            return (UnicodeRegex)clone();
315        } catch (CloneNotSupportedException e) {
316            throw new IllegalArgumentException(); // should never happen
317        }
318    }
319
320    /* (non-Javadoc)
321     * @see com.ibm.icu.util.Freezable#freeze()
322     */
323    @Override
324    public UnicodeRegex freeze() {
325        // no action needed now.
326        return this;
327    }
328
329    /* (non-Javadoc)
330     * @see com.ibm.icu.util.Freezable#isFrozen()
331     */
332    @Override
333    public boolean isFrozen() {
334        // at this point, always true
335        return true;
336    }
337
338    // ===== PRIVATES =====
339
340    private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
341        try {
342            pos.setIndex(i);
343            UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
344            x.complement().complement(); // hack to fix toPattern
345            result.append(x.toPattern(false));
346            i = pos.getIndex() - 1; // allow for the loop increment
347            return i;
348        } catch (Exception e) {
349            throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
350        }
351    }
352
353    private static final UnicodeRegex STANDARD = new UnicodeRegex();
354    private String bnfCommentString = "#";
355    private String bnfVariableInfix = "=";
356    private String bnfLineSeparator = "\n";
357//    private Appendable log = null;
358
359    private Comparator<Object> LongestFirst = new Comparator<Object>() {
360        @Override
361        public int compare(Object obj0, Object obj1) {
362            String arg0 = obj0.toString();
363            String arg1 = obj1.toString();
364            int len0 = arg0.length();
365            int len1 = arg1.length();
366            if (len0 != len1) return len1 - len0;
367            return arg0.compareTo(arg1);
368        }
369    };
370
371    private Map<String, String> getVariables(List<String> lines) {
372        Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
373        String variable = null;
374        StringBuffer definition = new StringBuffer();
375        int count = 0;
376        for (String line : lines) {
377            ++count;
378            // remove initial bom, comments
379            if (line.length() == 0) continue;
380            if (line.charAt(0) == '\uFEFF') line = line.substring(1);
381
382            if (bnfCommentString != null) {
383                int hashPos = line.indexOf(bnfCommentString);
384                if (hashPos >= 0) line = line.substring(0, hashPos);
385            }
386            String trimline = line.trim();
387            if (trimline.length() == 0) continue;
388
389            // String[] lineParts = line.split(";");
390            String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
391            if (linePart.trim().length() == 0) continue;
392            boolean terminated = trimline.endsWith(";");
393            if (terminated) {
394                linePart = linePart.substring(0,linePart.lastIndexOf(';'));
395            }
396            int equalsPos = linePart.indexOf(bnfVariableInfix);
397            if (equalsPos >= 0) {
398                if (variable != null) {
399                    throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
400                }
401                variable = linePart.substring(0,equalsPos).trim();
402                if (variables.containsKey(variable)) {
403                    throw new IllegalArgumentException("Duplicate variable definition in " + line);
404                }
405                definition.append(linePart.substring(equalsPos+1).trim());
406            } else { // no equals, so
407                if (variable == null) {
408                    throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
409                }
410                definition.append(bnfLineSeparator).append(linePart);
411            }
412            // we are terminated if i is not at the end, or the line ends with a ;
413            if (terminated) {
414                variables.put(variable, definition.toString());
415                variable = null; // signal we have no variable
416                definition.setLength(0);
417            }
418        }
419        if (variable != null) {
420            throw new IllegalArgumentException("Missing ';' at end");
421        }
422        return variables;
423    }
424}
425