1/*
2 *******************************************************************************
3 * Copyright (C) 2009-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7package com.ibm.icu.dev.test.translit;
8
9import java.util.ArrayList;
10import java.util.List;
11import java.util.regex.Matcher;
12import java.util.regex.Pattern;
13
14import com.ibm.icu.dev.test.TestFmwk;
15import com.ibm.icu.impl.UnicodeRegex;
16import com.ibm.icu.lang.UCharacter;
17import com.ibm.icu.lang.UProperty;
18import com.ibm.icu.lang.UProperty.NameChoice;
19import com.ibm.icu.text.Transliterator;
20import com.ibm.icu.text.UTF16;
21import com.ibm.icu.text.UnicodeSet;
22
23/**
24 * @author markdavis
25 */
26public class RegexUtilitiesTest extends TestFmwk {
27
28    public static void main(String[] args) throws Exception {
29        new RegexUtilitiesTest().run(args);
30    }
31
32    /**
33     * Check basic construction.
34     */
35    public void TestConstruction() {
36        String[][] tests = {
37                {"a"},
38                {"a[a-z]b"},
39                {"[ba-z]", "[a-z]"},
40                {"q[ba-z]", "q[a-z]"},
41                {"[ba-z]q", "[a-z]q"},
42                {"a\\p{joincontrol}b", "a[\u200C\u200D]b"},
43                {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"},
44                {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"},
45                {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"},
46        };
47        for (int i = 0; i < tests.length; ++i) {
48            final String source = tests[i][0];
49            String expected = tests[i].length == 1 ? source : tests[i][1];
50            String actual = UnicodeRegex.fix(source);
51            assertEquals(source, expected, actual);
52        }
53    }
54
55    Transliterator hex = Transliterator.getInstance("hex");
56
57    /**
58     * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each
59     * character works.
60     */
61    public void TestCharacters() {
62        UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");
63        boolean skip = getInclusion() < 10;
64        for (int cp = 0; cp < 0x110000; ++cp) {
65            if (cp > 0xFF && skip && (cp % 37 != 0)) {
66                continue;
67            }
68            String cpString = UTF16.valueOf(cp);
69            String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString;
70            String pattern = null;
71            final String rawPattern = "[" + s + s + "]";
72            try {
73                pattern = UnicodeRegex.fix(rawPattern);
74            } catch (Exception e) {
75                errln(e.getMessage());
76                continue;
77            }
78            final String expected = "[" + s + "]";
79            assertEquals("Doubled character works" + hex.transform(s), expected, pattern);
80
81            // verify that we can create a regex pattern and use as expected
82            String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);
83            checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch);
84
85            // verify that the Pattern.compile works
86            checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch);
87        }
88    }
89
90    /**
91     * Check all integer Unicode properties to make sure they work.
92     */
93    public void TestUnicodeProperties() {
94        final boolean skip = getInclusion() < 10;
95        UnicodeSet temp = new UnicodeSet();
96        for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) {
97            if (skip && (propNum % 5 != 0)) {
98                continue;
99            }
100            String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG);
101            final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum);
102            int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum);
103            if (skip) { // only test first if not exhaustive
104                intPropertyMaxValue = intPropertyMinValue;
105            }
106            for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) {
107                // hack for getting property value name
108                String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG);
109                if (valueName == null) {
110                    valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT);
111                    if (valueName == null) {
112                        valueName = Integer.toString(valueNum);
113                    }
114                }
115                temp.applyIntPropertyValue(propNum, valueNum);
116                if (temp.size() == 0) {
117                    continue;
118                }
119                final String prefix = "a";
120                final String suffix = "b";
121                String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
122                temp.complement();
123                String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
124
125                // posix style pattern
126                String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix;
127                String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix;
128                checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
129                checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
130
131                // perl style pattern
132                rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix;
133                rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix;
134                checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
135                checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
136            }
137        }
138    }
139
140    public void TestBnf() {
141        UnicodeRegex regex = new UnicodeRegex();
142        final String[][] tests = {
143                {
144                    "c = a wq;\n" +
145                    "a = xyz;\n" +
146                    "b = a a c;\n"
147                },
148                {
149                    "c = a b;\n" +
150                    "a = xyz;\n" +
151                    "b = a a c;\n",
152                    "Exception"
153                },
154                {
155                    "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" +
156                    "scheme = reserved+;\n" +
157                    "host = // reserved+;\n" +
158                    "query = [\\=reserved]+;\n" +
159                    "fragment = reserved+;\n" +
160                    "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n",
161                "http://\u03B1\u03B2\u03B3?huh=hi#there"},
162                {
163                    "langtagRegex.txt"
164                }
165        };
166        for (int i = 0; i < tests.length; ++i) {
167            String test = tests[i][0];
168            final boolean expectException = tests[i].length < 2 ? false : tests[i][1].equals("Exception");
169            try {
170                String result;
171                if (test.endsWith(".txt")) {
172                    java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test);
173                    List lines;
174                    try {
175                        lines = UnicodeRegex.appendLines(new ArrayList(), is, "UTF-8");
176                    } finally {
177                        is.close();
178                    }
179                    result = regex.compileBnf(lines);
180                } else {
181                    result = regex.compileBnf(test);
182                }
183                if (expectException) {
184                    errln("Expected exception for " + test);
185                    continue;
186                }
187                result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff
188                String resolved = regex.transform(result);
189                logln(resolved);
190                Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher("");
191                String checks = "";
192                for (int j = 1; j < tests[i].length; ++j) {
193                    String check = tests[i][j];
194                    if (!m.reset(check).matches()) {
195                        checks = checks + "Fails " + check + "\n";
196                    } else {
197                        for (int k = 1; k <= m.groupCount(); ++k) {
198                            checks += "(" + m.group(k) + ")";
199                        }
200                        checks += "\n";
201                    }
202                }
203                logln("Result: " + result + "\n" + checks + "\n" + test);
204            } catch (Exception e) {
205                if (!expectException) {
206                    errln(e.getClass().getName() + ": " + e.getMessage());
207                }
208                continue;
209            }
210        }
211    }
212
213    /**
214     * Utility for checking patterns
215     */
216    private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) {
217        Matcher matcher = pat.matcher(shouldMatch);
218        assertTrue(matchTitle + " and " + shouldMatch, matcher.matches());
219        matcher.reset(shouldNotMatch);
220        assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches());
221    }
222}
223