1/* 2 ******************************************************************************* 3 * Copyright (C) 2009-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7package com.ibm.icu.dev.test.translit; 8 9import java.util.ArrayList; 10import java.util.List; 11import java.util.regex.Matcher; 12import java.util.regex.Pattern; 13 14import com.ibm.icu.dev.test.TestFmwk; 15import com.ibm.icu.impl.UnicodeRegex; 16import com.ibm.icu.lang.UCharacter; 17import com.ibm.icu.lang.UProperty; 18import com.ibm.icu.lang.UProperty.NameChoice; 19import com.ibm.icu.text.Transliterator; 20import com.ibm.icu.text.UTF16; 21import com.ibm.icu.text.UnicodeSet; 22 23/** 24 * @author markdavis 25 */ 26public class RegexUtilitiesTest extends TestFmwk { 27 28 public static void main(String[] args) throws Exception { 29 new RegexUtilitiesTest().run(args); 30 } 31 32 /** 33 * Check basic construction. 34 */ 35 public void TestConstruction() { 36 String[][] tests = { 37 {"a"}, 38 {"a[a-z]b"}, 39 {"[ba-z]", "[a-z]"}, 40 {"q[ba-z]", "q[a-z]"}, 41 {"[ba-z]q", "[a-z]q"}, 42 {"a\\p{joincontrol}b", "a[\u200C\u200D]b"}, 43 {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"}, 44 {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"}, 45 {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"}, 46 }; 47 for (int i = 0; i < tests.length; ++i) { 48 final String source = tests[i][0]; 49 String expected = tests[i].length == 1 ? source : tests[i][1]; 50 String actual = UnicodeRegex.fix(source); 51 assertEquals(source, expected, actual); 52 } 53 } 54 55 Transliterator hex = Transliterator.getInstance("hex"); 56 57 /** 58 * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each 59 * character works. 60 */ 61 public void TestCharacters() { 62 UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]"); 63 boolean skip = getInclusion() < 10; 64 for (int cp = 0; cp < 0x110000; ++cp) { 65 if (cp > 0xFF && skip && (cp % 37 != 0)) { 66 continue; 67 } 68 String cpString = UTF16.valueOf(cp); 69 String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString; 70 String pattern = null; 71 final String rawPattern = "[" + s + s + "]"; 72 try { 73 pattern = UnicodeRegex.fix(rawPattern); 74 } catch (Exception e) { 75 errln(e.getMessage()); 76 continue; 77 } 78 final String expected = "[" + s + "]"; 79 assertEquals("Doubled character works" + hex.transform(s), expected, pattern); 80 81 // verify that we can create a regex pattern and use as expected 82 String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000); 83 checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch); 84 85 // verify that the Pattern.compile works 86 checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch); 87 } 88 } 89 90 /** 91 * Check all integer Unicode properties to make sure they work. 92 */ 93 public void TestUnicodeProperties() { 94 final boolean skip = getInclusion() < 10; 95 UnicodeSet temp = new UnicodeSet(); 96 for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) { 97 if (skip && (propNum % 5 != 0)) { 98 continue; 99 } 100 String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG); 101 final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum); 102 int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum); 103 if (skip) { // only test first if not exhaustive 104 intPropertyMaxValue = intPropertyMinValue; 105 } 106 for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) { 107 // hack for getting property value name 108 String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG); 109 if (valueName == null) { 110 valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT); 111 if (valueName == null) { 112 valueName = Integer.toString(valueNum); 113 } 114 } 115 temp.applyIntPropertyValue(propNum, valueNum); 116 if (temp.size() == 0) { 117 continue; 118 } 119 final String prefix = "a"; 120 final String suffix = "b"; 121 String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix; 122 temp.complement(); 123 String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix; 124 125 // posix style pattern 126 String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix; 127 String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix; 128 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch); 129 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch); 130 131 // perl style pattern 132 rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix; 133 rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix; 134 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch); 135 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch); 136 } 137 } 138 } 139 140 public void TestBnf() { 141 UnicodeRegex regex = new UnicodeRegex(); 142 final String[][] tests = { 143 { 144 "c = a wq;\n" + 145 "a = xyz;\n" + 146 "b = a a c;\n" 147 }, 148 { 149 "c = a b;\n" + 150 "a = xyz;\n" + 151 "b = a a c;\n", 152 "Exception" 153 }, 154 { 155 "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" + 156 "scheme = reserved+;\n" + 157 "host = // reserved+;\n" + 158 "query = [\\=reserved]+;\n" + 159 "fragment = reserved+;\n" + 160 "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n", 161 "http://\u03B1\u03B2\u03B3?huh=hi#there"}, 162 { 163 "langtagRegex.txt" 164 } 165 }; 166 for (int i = 0; i < tests.length; ++i) { 167 String test = tests[i][0]; 168 final boolean expectException = tests[i].length < 2 ? false : tests[i][1].equals("Exception"); 169 try { 170 String result; 171 if (test.endsWith(".txt")) { 172 java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test); 173 List lines; 174 try { 175 lines = UnicodeRegex.appendLines(new ArrayList(), is, "UTF-8"); 176 } finally { 177 is.close(); 178 } 179 result = regex.compileBnf(lines); 180 } else { 181 result = regex.compileBnf(test); 182 } 183 if (expectException) { 184 errln("Expected exception for " + test); 185 continue; 186 } 187 result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff 188 String resolved = regex.transform(result); 189 logln(resolved); 190 Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher(""); 191 String checks = ""; 192 for (int j = 1; j < tests[i].length; ++j) { 193 String check = tests[i][j]; 194 if (!m.reset(check).matches()) { 195 checks = checks + "Fails " + check + "\n"; 196 } else { 197 for (int k = 1; k <= m.groupCount(); ++k) { 198 checks += "(" + m.group(k) + ")"; 199 } 200 checks += "\n"; 201 } 202 } 203 logln("Result: " + result + "\n" + checks + "\n" + test); 204 } catch (Exception e) { 205 if (!expectException) { 206 errln(e.getClass().getName() + ": " + e.getMessage()); 207 } 208 continue; 209 } 210 } 211 } 212 213 /** 214 * Utility for checking patterns 215 */ 216 private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) { 217 Matcher matcher = pat.matcher(shouldMatch); 218 assertTrue(matchTitle + " and " + shouldMatch, matcher.matches()); 219 matcher.reset(shouldNotMatch); 220 assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches()); 221 } 222} 223