test-regexp.cc revision 592a9fc1d8ea420377a2e7efd0600e20b058be2b
// Copyright 2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28 29#include <stdlib.h> 30 31#include "v8.h" 32 33#include "ast.h" 34#include "char-predicates-inl.h" 35#include "cctest.h" 36#include "jsregexp.h" 37#include "parser.h" 38#include "regexp-macro-assembler.h" 39#include "regexp-macro-assembler-irregexp.h" 40#include "string-stream.h" 41#include "zone-inl.h" 42#ifdef V8_INTERPRETED_REGEXP 43#include "interpreter-irregexp.h" 44#else // V8_INTERPRETED_REGEXP 45#include "macro-assembler.h" 46#include "code.h" 47#ifdef V8_TARGET_ARCH_ARM 48#include "arm/assembler-arm.h" 49#include "arm/macro-assembler-arm.h" 50#include "arm/regexp-macro-assembler-arm.h" 51#endif 52#ifdef V8_TARGET_ARCH_MIPS 53#include "mips/assembler-mips.h" 54#include "mips/macro-assembler-mips.h" 55#include "mips/regexp-macro-assembler-mips.h" 56#endif 57#ifdef V8_TARGET_ARCH_X64 58#include "x64/assembler-x64.h" 59#include "x64/macro-assembler-x64.h" 60#include "x64/regexp-macro-assembler-x64.h" 61#endif 62#ifdef V8_TARGET_ARCH_IA32 63#include "ia32/assembler-ia32.h" 64#include "ia32/macro-assembler-ia32.h" 65#include "ia32/regexp-macro-assembler-ia32.h" 66#endif 67#endif // V8_INTERPRETED_REGEXP 68 69using namespace v8::internal; 70 71 72static bool CheckParse(const char* input) { 73 V8::Initialize(NULL); 74 v8::HandleScope scope; 75 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 76 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 77 RegExpCompileData result; 78 return v8::internal::RegExpParser::ParseRegExp(&reader, false, &result); 79} 80 81 82static SmartArrayPointer<const char> Parse(const char* input) { 83 V8::Initialize(NULL); 84 v8::HandleScope scope; 85 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 86 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 87 RegExpCompileData result; 88 CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result)); 89 CHECK(result.tree != NULL); 90 CHECK(result.error.is_null()); 91 SmartArrayPointer<const char> output = result.tree->ToString(); 92 
return output; 93} 94 95static bool CheckSimple(const char* input) { 96 V8::Initialize(NULL); 97 v8::HandleScope scope; 98 unibrow::Utf8InputBuffer<> buffer(input, StrLength(input)); 99 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 100 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 101 RegExpCompileData result; 102 CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result)); 103 CHECK(result.tree != NULL); 104 CHECK(result.error.is_null()); 105 return result.simple; 106} 107 108struct MinMaxPair { 109 int min_match; 110 int max_match; 111}; 112 113static MinMaxPair CheckMinMaxMatch(const char* input) { 114 V8::Initialize(NULL); 115 v8::HandleScope scope; 116 unibrow::Utf8InputBuffer<> buffer(input, StrLength(input)); 117 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 118 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 119 RegExpCompileData result; 120 CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result)); 121 CHECK(result.tree != NULL); 122 CHECK(result.error.is_null()); 123 int min_match = result.tree->min_match(); 124 int max_match = result.tree->max_match(); 125 MinMaxPair pair = { min_match, max_match }; 126 return pair; 127} 128 129 130#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input)) 131#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input)) 132#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input)); 133#define CHECK_MIN_MAX(input, min, max) \ 134 { MinMaxPair min_max = CheckMinMaxMatch(input); \ 135 CHECK_EQ(min, min_max.min_match); \ 136 CHECK_EQ(max, min_max.max_match); \ 137 } 138 139TEST(Parser) { 140 V8::Initialize(NULL); 141 142 CHECK_PARSE_ERROR("?"); 143 144 CHECK_PARSE_EQ("abc", "'abc'"); 145 CHECK_PARSE_EQ("", "%"); 146 CHECK_PARSE_EQ("abc|def", "(| 'abc' 'def')"); 147 CHECK_PARSE_EQ("abc|def|ghi", "(| 'abc' 'def' 'ghi')"); 148 CHECK_PARSE_EQ("^xxx$", "(: @^i 'xxx' @$i)"); 149 CHECK_PARSE_EQ("ab\\b\\d\\bcd", "(: 'ab' @b 
[0-9] @b 'cd')"); 150 CHECK_PARSE_EQ("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])"); 151 CHECK_PARSE_EQ("a*", "(# 0 - g 'a')"); 152 CHECK_PARSE_EQ("a*?", "(# 0 - n 'a')"); 153 CHECK_PARSE_EQ("abc+", "(: 'ab' (# 1 - g 'c'))"); 154 CHECK_PARSE_EQ("abc+?", "(: 'ab' (# 1 - n 'c'))"); 155 CHECK_PARSE_EQ("xyz?", "(: 'xy' (# 0 1 g 'z'))"); 156 CHECK_PARSE_EQ("xyz??", "(: 'xy' (# 0 1 n 'z'))"); 157 CHECK_PARSE_EQ("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))"); 158 CHECK_PARSE_EQ("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))"); 159 CHECK_PARSE_EQ("xyz{93}", "(: 'xy' (# 93 93 g 'z'))"); 160 CHECK_PARSE_EQ("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))"); 161 CHECK_PARSE_EQ("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))"); 162 CHECK_PARSE_EQ("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))"); 163 CHECK_PARSE_EQ("xyz{1,}", "(: 'xy' (# 1 - g 'z'))"); 164 CHECK_PARSE_EQ("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))"); 165 CHECK_PARSE_EQ("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'"); 166 CHECK_PARSE_EQ("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')"); 167 CHECK_PARSE_EQ("(?:foo)", "'foo'"); 168 CHECK_PARSE_EQ("(?: foo )", "' foo '"); 169 CHECK_PARSE_EQ("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))"); 170 CHECK_PARSE_EQ("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')"); 171 CHECK_PARSE_EQ("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')"); 172 CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')"); 173 CHECK_PARSE_EQ("()", "(^ %)"); 174 CHECK_PARSE_EQ("(?=)", "(-> + %)"); 175 CHECK_PARSE_EQ("[]", "^[\\x00-\\uffff]"); // Doesn't compile on windows 176 CHECK_PARSE_EQ("[^]", "[\\x00-\\uffff]"); // \uffff isn't in codepage 1252 177 CHECK_PARSE_EQ("[x]", "[x]"); 178 CHECK_PARSE_EQ("[xyz]", "[x y z]"); 179 CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]"); 180 CHECK_PARSE_EQ("[-123]", "[- 1 2 3]"); 181 CHECK_PARSE_EQ("[^123]", "^[1 2 3]"); 182 CHECK_PARSE_EQ("]", "']'"); 183 CHECK_PARSE_EQ("}", "'}'"); 184 CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]"); 185 CHECK_PARSE_EQ("[\\d]", "[0-9]"); 186 CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]"); 187 
CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]"); 188 CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]"); 189 CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]"); 190 // Control character outside character class. 191 CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK", 192 "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'"); 193 CHECK_PARSE_EQ("\\c!", "'\\c!'"); 194 CHECK_PARSE_EQ("\\c_", "'\\c_'"); 195 CHECK_PARSE_EQ("\\c~", "'\\c~'"); 196 CHECK_PARSE_EQ("\\c1", "'\\c1'"); 197 // Control character inside character class. 198 CHECK_PARSE_EQ("[\\c!]", "[\\ c !]"); 199 CHECK_PARSE_EQ("[\\c_]", "[\\x1f]"); 200 CHECK_PARSE_EQ("[\\c~]", "[\\ c ~]"); 201 CHECK_PARSE_EQ("[\\ca]", "[\\x01]"); 202 CHECK_PARSE_EQ("[\\cz]", "[\\x1a]"); 203 CHECK_PARSE_EQ("[\\cA]", "[\\x01]"); 204 CHECK_PARSE_EQ("[\\cZ]", "[\\x1a]"); 205 CHECK_PARSE_EQ("[\\c1]", "[\\x11]"); 206 207 CHECK_PARSE_EQ("[a\\]c]", "[a ] c]"); 208 CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '"); 209 CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ # ]"); 210 CHECK_PARSE_EQ("\\0", "'\\x00'"); 211 CHECK_PARSE_EQ("\\8", "'8'"); 212 CHECK_PARSE_EQ("\\9", "'9'"); 213 CHECK_PARSE_EQ("\\11", "'\\x09'"); 214 CHECK_PARSE_EQ("\\11a", "'\\x09a'"); 215 CHECK_PARSE_EQ("\\011", "'\\x09'"); 216 CHECK_PARSE_EQ("\\00011", "'\\x0011'"); 217 CHECK_PARSE_EQ("\\118", "'\\x098'"); 218 CHECK_PARSE_EQ("\\111", "'I'"); 219 CHECK_PARSE_EQ("\\1111", "'I1'"); 220 CHECK_PARSE_EQ("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))"); 221 CHECK_PARSE_EQ("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))"); 222 CHECK_PARSE_EQ("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))"); 223 CHECK_PARSE_EQ("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')"); 224 CHECK_PARSE_EQ("(x)(x)(x)\\1*", "(: (^ 'x') (^ 'x') (^ 'x')" 225 " (# 0 - g (<- 1)))"); 226 CHECK_PARSE_EQ("(x)(x)(x)\\2*", "(: (^ 'x') (^ 'x') (^ 'x')" 227 " (# 0 - g (<- 2)))"); 228 CHECK_PARSE_EQ("(x)(x)(x)\\3*", "(: (^ 'x') (^ 'x') (^ 'x')" 229 " (# 0 - g (<- 3)))"); 230 CHECK_PARSE_EQ("(x)(x)(x)\\4*", "(: (^ 
'x') (^ 'x') (^ 'x')" 231 " (# 0 - g '\\x04'))"); 232 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10", 233 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')" 234 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))"); 235 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11", 236 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')" 237 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')"); 238 CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))"); 239 CHECK_PARSE_EQ("(a\\1)", "(^ 'a')"); 240 CHECK_PARSE_EQ("(\\1a)", "(^ 'a')"); 241 CHECK_PARSE_EQ("(?=a)?a", "'a'"); 242 CHECK_PARSE_EQ("(?=a){0,10}a", "'a'"); 243 CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')"); 244 CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')"); 245 CHECK_PARSE_EQ("(?!a)?a", "'a'"); 246 CHECK_PARSE_EQ("\\1(a)", "(^ 'a')"); 247 CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))"); 248 CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))"); 249 CHECK_PARSE_EQ("[\\0]", "[\\x00]"); 250 CHECK_PARSE_EQ("[\\11]", "[\\x09]"); 251 CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]"); 252 CHECK_PARSE_EQ("[\\011]", "[\\x09]"); 253 CHECK_PARSE_EQ("[\\00011]", "[\\x00 1 1]"); 254 CHECK_PARSE_EQ("[\\118]", "[\\x09 8]"); 255 CHECK_PARSE_EQ("[\\111]", "[I]"); 256 CHECK_PARSE_EQ("[\\1111]", "[I 1]"); 257 CHECK_PARSE_EQ("\\x34", "'\x34'"); 258 CHECK_PARSE_EQ("\\x60", "'\x60'"); 259 CHECK_PARSE_EQ("\\x3z", "'x3z'"); 260 CHECK_PARSE_EQ("\\c", "'\\c'"); 261 CHECK_PARSE_EQ("\\u0034", "'\x34'"); 262 CHECK_PARSE_EQ("\\u003z", "'u003z'"); 263 CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))"); 264 265 CHECK_SIMPLE("a", true); 266 CHECK_SIMPLE("a|b", false); 267 CHECK_SIMPLE("a\\n", false); 268 CHECK_SIMPLE("^a", false); 269 CHECK_SIMPLE("a$", false); 270 CHECK_SIMPLE("a\\b!", false); 271 CHECK_SIMPLE("a\\Bb", false); 272 CHECK_SIMPLE("a*", false); 273 CHECK_SIMPLE("a*?", false); 274 CHECK_SIMPLE("a?", false); 275 CHECK_SIMPLE("a??", false); 276 CHECK_SIMPLE("a{0,1}?", false); 277 CHECK_SIMPLE("a{1,1}?", false); 278 
CHECK_SIMPLE("a{1,2}?", false); 279 CHECK_SIMPLE("a+?", false); 280 CHECK_SIMPLE("(a)", false); 281 CHECK_SIMPLE("(a)\\1", false); 282 CHECK_SIMPLE("(\\1a)", false); 283 CHECK_SIMPLE("\\1(a)", false); 284 CHECK_SIMPLE("a\\s", false); 285 CHECK_SIMPLE("a\\S", false); 286 CHECK_SIMPLE("a\\d", false); 287 CHECK_SIMPLE("a\\D", false); 288 CHECK_SIMPLE("a\\w", false); 289 CHECK_SIMPLE("a\\W", false); 290 CHECK_SIMPLE("a.", false); 291 CHECK_SIMPLE("a\\q", false); 292 CHECK_SIMPLE("a[a]", false); 293 CHECK_SIMPLE("a[^a]", false); 294 CHECK_SIMPLE("a[a-z]", false); 295 CHECK_SIMPLE("a[\\q]", false); 296 CHECK_SIMPLE("a(?:b)", false); 297 CHECK_SIMPLE("a(?=b)", false); 298 CHECK_SIMPLE("a(?!b)", false); 299 CHECK_SIMPLE("\\x60", false); 300 CHECK_SIMPLE("\\u0060", false); 301 CHECK_SIMPLE("\\cA", false); 302 CHECK_SIMPLE("\\q", false); 303 CHECK_SIMPLE("\\1112", false); 304 CHECK_SIMPLE("\\0", false); 305 CHECK_SIMPLE("(a)\\1", false); 306 CHECK_SIMPLE("(?=a)?a", false); 307 CHECK_SIMPLE("(?!a)?a\\1", false); 308 CHECK_SIMPLE("(?:(?=a))a\\1", false); 309 310 CHECK_PARSE_EQ("a{}", "'a{}'"); 311 CHECK_PARSE_EQ("a{,}", "'a{,}'"); 312 CHECK_PARSE_EQ("a{", "'a{'"); 313 CHECK_PARSE_EQ("a{z}", "'a{z}'"); 314 CHECK_PARSE_EQ("a{1z}", "'a{1z}'"); 315 CHECK_PARSE_EQ("a{12z}", "'a{12z}'"); 316 CHECK_PARSE_EQ("a{12,", "'a{12,'"); 317 CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'"); 318 CHECK_PARSE_EQ("{}", "'{}'"); 319 CHECK_PARSE_EQ("{,}", "'{,}'"); 320 CHECK_PARSE_EQ("{", "'{'"); 321 CHECK_PARSE_EQ("{z}", "'{z}'"); 322 CHECK_PARSE_EQ("{1z}", "'{1z}'"); 323 CHECK_PARSE_EQ("{12z}", "'{12z}'"); 324 CHECK_PARSE_EQ("{12,", "'{12,'"); 325 CHECK_PARSE_EQ("{12,3b", "'{12,3b'"); 326 327 CHECK_MIN_MAX("a", 1, 1); 328 CHECK_MIN_MAX("abc", 3, 3); 329 CHECK_MIN_MAX("a[bc]d", 3, 3); 330 CHECK_MIN_MAX("a|bc", 1, 2); 331 CHECK_MIN_MAX("ab|c", 1, 2); 332 CHECK_MIN_MAX("a||bc", 0, 2); 333 CHECK_MIN_MAX("|", 0, 0); 334 CHECK_MIN_MAX("(?:ab)", 2, 2); 335 CHECK_MIN_MAX("(?:ab|cde)", 2, 3); 336 
CHECK_MIN_MAX("(?:ab)|cde", 2, 3); 337 CHECK_MIN_MAX("(ab)", 2, 2); 338 CHECK_MIN_MAX("(ab|cde)", 2, 3); 339 CHECK_MIN_MAX("(ab)\\1", 2, 4); 340 CHECK_MIN_MAX("(ab|cde)\\1", 2, 6); 341 CHECK_MIN_MAX("(?:ab)?", 0, 2); 342 CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity); 343 CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity); 344 CHECK_MIN_MAX("a?", 0, 1); 345 CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity); 346 CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity); 347 CHECK_MIN_MAX("a??", 0, 1); 348 CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity); 349 CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity); 350 CHECK_MIN_MAX("(?:a?)?", 0, 1); 351 CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity); 352 CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity); 353 CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity); 354 CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity); 355 CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity); 356 CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity); 357 CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity); 358 CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity); 359 CHECK_MIN_MAX("a{0}", 0, 0); 360 CHECK_MIN_MAX("(?:a+){0}", 0, 0); 361 CHECK_MIN_MAX("(?:a+){0,0}", 0, 0); 362 CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity); 363 CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity); 364 CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity); 365 CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity); 366 CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity); 367 CHECK_MIN_MAX("(?:ab){4,7}", 8, 14); 368 CHECK_MIN_MAX("a\\bc", 2, 2); 369 CHECK_MIN_MAX("a\\Bc", 2, 2); 370 CHECK_MIN_MAX("a\\sc", 3, 3); 371 CHECK_MIN_MAX("a\\Sc", 3, 3); 372 CHECK_MIN_MAX("a(?=b)c", 2, 2); 373 CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2); 374 CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2); 375} 376 377TEST(ParserRegression) { 378 CHECK_PARSE_EQ("[A-Z$-][x]", "(! 
[A-Z $ -] [x])"); 379 CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')"); 380 CHECK_PARSE_EQ("{", "'{'"); 381 CHECK_PARSE_EQ("a|", "(| 'a' %)"); 382} 383 384static void ExpectError(const char* input, 385 const char* expected) { 386 V8::Initialize(NULL); 387 v8::HandleScope scope; 388 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 389 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 390 RegExpCompileData result; 391 CHECK(!v8::internal::RegExpParser::ParseRegExp(&reader, false, &result)); 392 CHECK(result.tree == NULL); 393 CHECK(!result.error.is_null()); 394 SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS); 395 CHECK_EQ(expected, *str); 396} 397 398 399TEST(Errors) { 400 V8::Initialize(NULL); 401 const char* kEndBackslash = "\\ at end of pattern"; 402 ExpectError("\\", kEndBackslash); 403 const char* kUnterminatedGroup = "Unterminated group"; 404 ExpectError("(foo", kUnterminatedGroup); 405 const char* kInvalidGroup = "Invalid group"; 406 ExpectError("(?", kInvalidGroup); 407 const char* kUnterminatedCharacterClass = "Unterminated character class"; 408 ExpectError("[", kUnterminatedCharacterClass); 409 ExpectError("[a-", kUnterminatedCharacterClass); 410 const char* kNothingToRepeat = "Nothing to repeat"; 411 ExpectError("*", kNothingToRepeat); 412 ExpectError("?", kNothingToRepeat); 413 ExpectError("+", kNothingToRepeat); 414 ExpectError("{1}", kNothingToRepeat); 415 ExpectError("{1,2}", kNothingToRepeat); 416 ExpectError("{1,}", kNothingToRepeat); 417 418 // Check that we don't allow more than kMaxCapture captures 419 const int kMaxCaptures = 1 << 16; // Must match RegExpParser::kMaxCaptures. 
420 const char* kTooManyCaptures = "Too many captures"; 421 HeapStringAllocator allocator; 422 StringStream accumulator(&allocator); 423 for (int i = 0; i <= kMaxCaptures; i++) { 424 accumulator.Add("()"); 425 } 426 SmartArrayPointer<const char> many_captures(accumulator.ToCString()); 427 ExpectError(*many_captures, kTooManyCaptures); 428} 429 430 431static bool IsDigit(uc16 c) { 432 return ('0' <= c && c <= '9'); 433} 434 435 436static bool NotDigit(uc16 c) { 437 return !IsDigit(c); 438} 439 440 441static bool IsWhiteSpace(uc16 c) { 442 switch (c) { 443 case 0x09: 444 case 0x0A: 445 case 0x0B: 446 case 0x0C: 447 case 0x0d: 448 case 0x20: 449 case 0xA0: 450 case 0x2028: 451 case 0x2029: 452 return true; 453 default: 454 return unibrow::Space::Is(c); 455 } 456} 457 458 459static bool NotWhiteSpace(uc16 c) { 460 return !IsWhiteSpace(c); 461} 462 463 464static bool NotWord(uc16 c) { 465 return !IsRegExpWord(c); 466} 467 468 469static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) { 470 ZoneScope scope(Isolate::Current(), DELETE_ON_EXIT); 471 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); 472 CharacterRange::AddClassEscape(c, ranges); 473 for (unsigned i = 0; i < (1 << 16); i++) { 474 bool in_class = false; 475 for (int j = 0; !in_class && j < ranges->length(); j++) { 476 CharacterRange& range = ranges->at(j); 477 in_class = (range.from() <= i && i <= range.to()); 478 } 479 CHECK_EQ(pred(i), in_class); 480 } 481} 482 483 484TEST(CharacterClassEscapes) { 485 v8::internal::V8::Initialize(NULL); 486 TestCharacterClassEscapes('.', IsRegExpNewline); 487 TestCharacterClassEscapes('d', IsDigit); 488 TestCharacterClassEscapes('D', NotDigit); 489 TestCharacterClassEscapes('s', IsWhiteSpace); 490 TestCharacterClassEscapes('S', NotWhiteSpace); 491 TestCharacterClassEscapes('w', IsRegExpWord); 492 TestCharacterClassEscapes('W', NotWord); 493} 494 495 496static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) { 497 
V8::Initialize(NULL); 498 Isolate* isolate = Isolate::Current(); 499 FlatStringReader reader(isolate, CStrVector(input)); 500 RegExpCompileData compile_data; 501 if (!v8::internal::RegExpParser::ParseRegExp(&reader, multiline, 502 &compile_data)) 503 return NULL; 504 Handle<String> pattern = isolate->factory()-> 505 NewStringFromUtf8(CStrVector(input)); 506 RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii); 507 return compile_data.node; 508} 509 510 511static void Execute(const char* input, 512 bool multiline, 513 bool is_ascii, 514 bool dot_output = false) { 515 v8::HandleScope scope; 516 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 517 RegExpNode* node = Compile(input, multiline, is_ascii); 518 USE(node); 519#ifdef DEBUG 520 if (dot_output) { 521 RegExpEngine::DotPrint(input, node, false); 522 exit(0); 523 } 524#endif // DEBUG 525} 526 527 528class TestConfig { 529 public: 530 typedef int Key; 531 typedef int Value; 532 static const int kNoKey; 533 static int NoValue() { return 0; } 534 static inline int Compare(int a, int b) { 535 if (a < b) 536 return -1; 537 else if (a > b) 538 return 1; 539 else 540 return 0; 541 } 542}; 543 544 545const int TestConfig::kNoKey = 0; 546 547 548static unsigned PseudoRandom(int i, int j) { 549 return ~(~((i * 781) ^ (j * 329))); 550} 551 552 553TEST(SplayTreeSimple) { 554 v8::internal::V8::Initialize(NULL); 555 static const unsigned kLimit = 1000; 556 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 557 ZoneSplayTree<TestConfig> tree; 558 bool seen[kLimit]; 559 for (unsigned i = 0; i < kLimit; i++) seen[i] = false; 560#define CHECK_MAPS_EQUAL() do { \ 561 for (unsigned k = 0; k < kLimit; k++) \ 562 CHECK_EQ(seen[k], tree.Find(k, &loc)); \ 563 } while (false) 564 for (int i = 0; i < 50; i++) { 565 for (int j = 0; j < 50; j++) { 566 unsigned next = PseudoRandom(i, j) % kLimit; 567 if (seen[next]) { 568 // We've already seen this one. Check the value and remove 569 // it. 
570 ZoneSplayTree<TestConfig>::Locator loc; 571 CHECK(tree.Find(next, &loc)); 572 CHECK_EQ(next, loc.key()); 573 CHECK_EQ(3 * next, loc.value()); 574 tree.Remove(next); 575 seen[next] = false; 576 CHECK_MAPS_EQUAL(); 577 } else { 578 // Check that it wasn't there already and then add it. 579 ZoneSplayTree<TestConfig>::Locator loc; 580 CHECK(!tree.Find(next, &loc)); 581 CHECK(tree.Insert(next, &loc)); 582 CHECK_EQ(next, loc.key()); 583 loc.set_value(3 * next); 584 seen[next] = true; 585 CHECK_MAPS_EQUAL(); 586 } 587 int val = PseudoRandom(j, i) % kLimit; 588 if (seen[val]) { 589 ZoneSplayTree<TestConfig>::Locator loc; 590 CHECK(tree.FindGreatestLessThan(val, &loc)); 591 CHECK_EQ(loc.key(), val); 592 break; 593 } 594 val = PseudoRandom(i + j, i - j) % kLimit; 595 if (seen[val]) { 596 ZoneSplayTree<TestConfig>::Locator loc; 597 CHECK(tree.FindLeastGreaterThan(val, &loc)); 598 CHECK_EQ(loc.key(), val); 599 break; 600 } 601 } 602 } 603} 604 605 606TEST(DispatchTableConstruction) { 607 v8::internal::V8::Initialize(NULL); 608 // Initialize test data. 609 static const int kLimit = 1000; 610 static const int kRangeCount = 8; 611 static const int kRangeSize = 16; 612 uc16 ranges[kRangeCount][2 * kRangeSize]; 613 for (int i = 0; i < kRangeCount; i++) { 614 Vector<uc16> range(ranges[i], 2 * kRangeSize); 615 for (int j = 0; j < 2 * kRangeSize; j++) { 616 range[j] = PseudoRandom(i + 25, j + 87) % kLimit; 617 } 618 range.Sort(); 619 for (int j = 1; j < 2 * kRangeSize; j++) { 620 CHECK(range[j-1] <= range[j]); 621 } 622 } 623 // Enter test data into dispatch table. 
624 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 625 DispatchTable table; 626 for (int i = 0; i < kRangeCount; i++) { 627 uc16* range = ranges[i]; 628 for (int j = 0; j < 2 * kRangeSize; j += 2) 629 table.AddRange(CharacterRange(range[j], range[j + 1]), i); 630 } 631 // Check that the table looks as we would expect 632 for (int p = 0; p < kLimit; p++) { 633 OutSet* outs = table.Get(p); 634 for (int j = 0; j < kRangeCount; j++) { 635 uc16* range = ranges[j]; 636 bool is_on = false; 637 for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2) 638 is_on = (range[k] <= p && p <= range[k + 1]); 639 CHECK_EQ(is_on, outs->Get(j)); 640 } 641 } 642} 643 644// Test of debug-only syntax. 645#ifdef DEBUG 646 647TEST(ParsePossessiveRepetition) { 648 bool old_flag_value = FLAG_regexp_possessive_quantifier; 649 650 // Enable possessive quantifier syntax. 651 FLAG_regexp_possessive_quantifier = true; 652 653 CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')"); 654 CHECK_PARSE_EQ("a++", "(# 1 - p 'a')"); 655 CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')"); 656 CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')"); 657 CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')"); 658 659 // Disable possessive quantifier syntax. 660 FLAG_regexp_possessive_quantifier = false; 661 662 CHECK_PARSE_ERROR("a*+"); 663 CHECK_PARSE_ERROR("a++"); 664 CHECK_PARSE_ERROR("a?+"); 665 CHECK_PARSE_ERROR("a{10,20}+"); 666 CHECK_PARSE_ERROR("a{10,20}+b"); 667 668 FLAG_regexp_possessive_quantifier = old_flag_value; 669} 670 671#endif 672 673// Tests of interpreter. 
674 675 676#ifndef V8_INTERPRETED_REGEXP 677 678#if V8_TARGET_ARCH_IA32 679typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler; 680#elif V8_TARGET_ARCH_X64 681typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler; 682#elif V8_TARGET_ARCH_ARM 683typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler; 684#elif V8_TARGET_ARCH_MIPS 685typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler; 686#endif 687 688class ContextInitializer { 689 public: 690 ContextInitializer() 691 : env_(), scope_(), zone_(Isolate::Current(), DELETE_ON_EXIT) { 692 env_ = v8::Context::New(); 693 env_->Enter(); 694 } 695 ~ContextInitializer() { 696 env_->Exit(); 697 env_.Dispose(); 698 } 699 private: 700 v8::Persistent<v8::Context> env_; 701 v8::HandleScope scope_; 702 v8::internal::ZoneScope zone_; 703}; 704 705 706static ArchRegExpMacroAssembler::Result Execute(Code* code, 707 String* input, 708 int start_offset, 709 const byte* input_start, 710 const byte* input_end, 711 int* captures) { 712 return NativeRegExpMacroAssembler::Execute( 713 code, 714 input, 715 start_offset, 716 input_start, 717 input_end, 718 captures, 719 Isolate::Current()); 720} 721 722 723TEST(MacroAssemblerNativeSuccess) { 724 v8::V8::Initialize(); 725 ContextInitializer initializer; 726 Factory* factory = Isolate::Current()->factory(); 727 728 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 729 730 m.Succeed(); 731 732 Handle<String> source = factory->NewStringFromAscii(CStrVector("")); 733 Handle<Object> code_object = m.GetCode(source); 734 Handle<Code> code = Handle<Code>::cast(code_object); 735 736 int captures[4] = {42, 37, 87, 117}; 737 Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo")); 738 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 739 const byte* start_adr = 740 reinterpret_cast<const byte*>(seq_input->GetCharsAddress()); 741 742 NativeRegExpMacroAssembler::Result result = 743 Execute(*code, 744 *input, 745 0, 746 start_adr, 747 
start_adr + seq_input->length(), 748 captures); 749 750 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 751 CHECK_EQ(-1, captures[0]); 752 CHECK_EQ(-1, captures[1]); 753 CHECK_EQ(-1, captures[2]); 754 CHECK_EQ(-1, captures[3]); 755} 756 757 758TEST(MacroAssemblerNativeSimple) { 759 v8::V8::Initialize(); 760 ContextInitializer initializer; 761 Factory* factory = Isolate::Current()->factory(); 762 763 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 764 765 uc16 foo_chars[3] = {'f', 'o', 'o'}; 766 Vector<const uc16> foo(foo_chars, 3); 767 768 Label fail; 769 m.CheckCharacters(foo, 0, &fail, true); 770 m.WriteCurrentPositionToRegister(0, 0); 771 m.AdvanceCurrentPosition(3); 772 m.WriteCurrentPositionToRegister(1, 0); 773 m.Succeed(); 774 m.Bind(&fail); 775 m.Fail(); 776 777 Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo")); 778 Handle<Object> code_object = m.GetCode(source); 779 Handle<Code> code = Handle<Code>::cast(code_object); 780 781 int captures[4] = {42, 37, 87, 117}; 782 Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo")); 783 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 784 Address start_adr = seq_input->GetCharsAddress(); 785 786 NativeRegExpMacroAssembler::Result result = 787 Execute(*code, 788 *input, 789 0, 790 start_adr, 791 start_adr + input->length(), 792 captures); 793 794 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 795 CHECK_EQ(0, captures[0]); 796 CHECK_EQ(3, captures[1]); 797 CHECK_EQ(-1, captures[2]); 798 CHECK_EQ(-1, captures[3]); 799 800 input = factory->NewStringFromAscii(CStrVector("barbarbar")); 801 seq_input = Handle<SeqAsciiString>::cast(input); 802 start_adr = seq_input->GetCharsAddress(); 803 804 result = Execute(*code, 805 *input, 806 0, 807 start_adr, 808 start_adr + input->length(), 809 captures); 810 811 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 812} 813 814 815TEST(MacroAssemblerNativeSimpleUC16) { 816 
v8::V8::Initialize(); 817 ContextInitializer initializer; 818 Factory* factory = Isolate::Current()->factory(); 819 820 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4); 821 822 uc16 foo_chars[3] = {'f', 'o', 'o'}; 823 Vector<const uc16> foo(foo_chars, 3); 824 825 Label fail; 826 m.CheckCharacters(foo, 0, &fail, true); 827 m.WriteCurrentPositionToRegister(0, 0); 828 m.AdvanceCurrentPosition(3); 829 m.WriteCurrentPositionToRegister(1, 0); 830 m.Succeed(); 831 m.Bind(&fail); 832 m.Fail(); 833 834 Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo")); 835 Handle<Object> code_object = m.GetCode(source); 836 Handle<Code> code = Handle<Code>::cast(code_object); 837 838 int captures[4] = {42, 37, 87, 117}; 839 const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o', '\xa0'}; 840 Handle<String> input = 841 factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6)); 842 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input); 843 Address start_adr = seq_input->GetCharsAddress(); 844 845 NativeRegExpMacroAssembler::Result result = 846 Execute(*code, 847 *input, 848 0, 849 start_adr, 850 start_adr + input->length(), 851 captures); 852 853 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 854 CHECK_EQ(0, captures[0]); 855 CHECK_EQ(3, captures[1]); 856 CHECK_EQ(-1, captures[2]); 857 CHECK_EQ(-1, captures[3]); 858 859 const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a', '\xa0'}; 860 input = factory->NewStringFromTwoByte(Vector<const uc16>(input_data2, 9)); 861 seq_input = Handle<SeqTwoByteString>::cast(input); 862 start_adr = seq_input->GetCharsAddress(); 863 864 result = Execute(*code, 865 *input, 866 0, 867 start_adr, 868 start_adr + input->length() * 2, 869 captures); 870 871 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 872} 873 874 875TEST(MacroAssemblerNativeBacktrack) { 876 v8::V8::Initialize(); 877 ContextInitializer initializer; 878 Factory* factory = Isolate::Current()->factory(); 879 
880 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0); 881 882 Label fail; 883 Label backtrack; 884 m.LoadCurrentCharacter(10, &fail); 885 m.Succeed(); 886 m.Bind(&fail); 887 m.PushBacktrack(&backtrack); 888 m.LoadCurrentCharacter(10, NULL); 889 m.Succeed(); 890 m.Bind(&backtrack); 891 m.Fail(); 892 893 Handle<String> source = factory->NewStringFromAscii(CStrVector("..........")); 894 Handle<Object> code_object = m.GetCode(source); 895 Handle<Code> code = Handle<Code>::cast(code_object); 896 897 Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo")); 898 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 899 Address start_adr = seq_input->GetCharsAddress(); 900 901 NativeRegExpMacroAssembler::Result result = 902 Execute(*code, 903 *input, 904 0, 905 start_adr, 906 start_adr + input->length(), 907 NULL); 908 909 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 910} 911 912 913TEST(MacroAssemblerNativeBackReferenceASCII) { 914 v8::V8::Initialize(); 915 ContextInitializer initializer; 916 Factory* factory = Isolate::Current()->factory(); 917 918 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 919 920 m.WriteCurrentPositionToRegister(0, 0); 921 m.AdvanceCurrentPosition(2); 922 m.WriteCurrentPositionToRegister(1, 0); 923 Label nomatch; 924 m.CheckNotBackReference(0, &nomatch); 925 m.Fail(); 926 m.Bind(&nomatch); 927 m.AdvanceCurrentPosition(2); 928 Label missing_match; 929 m.CheckNotBackReference(0, &missing_match); 930 m.WriteCurrentPositionToRegister(2, 0); 931 m.Succeed(); 932 m.Bind(&missing_match); 933 m.Fail(); 934 935 Handle<String> source = factory->NewStringFromAscii(CStrVector("^(..)..\1")); 936 Handle<Object> code_object = m.GetCode(source); 937 Handle<Code> code = Handle<Code>::cast(code_object); 938 939 Handle<String> input = factory->NewStringFromAscii(CStrVector("fooofo")); 940 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 941 Address start_adr = 
seq_input->GetCharsAddress(); 942 943 int output[4]; 944 NativeRegExpMacroAssembler::Result result = 945 Execute(*code, 946 *input, 947 0, 948 start_adr, 949 start_adr + input->length(), 950 output); 951 952 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 953 CHECK_EQ(0, output[0]); 954 CHECK_EQ(2, output[1]); 955 CHECK_EQ(6, output[2]); 956 CHECK_EQ(-1, output[3]); 957} 958 959 960TEST(MacroAssemblerNativeBackReferenceUC16) { 961 v8::V8::Initialize(); 962 ContextInitializer initializer; 963 Factory* factory = Isolate::Current()->factory(); 964 965 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4); 966 967 m.WriteCurrentPositionToRegister(0, 0); 968 m.AdvanceCurrentPosition(2); 969 m.WriteCurrentPositionToRegister(1, 0); 970 Label nomatch; 971 m.CheckNotBackReference(0, &nomatch); 972 m.Fail(); 973 m.Bind(&nomatch); 974 m.AdvanceCurrentPosition(2); 975 Label missing_match; 976 m.CheckNotBackReference(0, &missing_match); 977 m.WriteCurrentPositionToRegister(2, 0); 978 m.Succeed(); 979 m.Bind(&missing_match); 980 m.Fail(); 981 982 Handle<String> source = factory->NewStringFromAscii(CStrVector("^(..)..\1")); 983 Handle<Object> code_object = m.GetCode(source); 984 Handle<Code> code = Handle<Code>::cast(code_object); 985 986 const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028}; 987 Handle<String> input = 988 factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6)); 989 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input); 990 Address start_adr = seq_input->GetCharsAddress(); 991 992 int output[4]; 993 NativeRegExpMacroAssembler::Result result = 994 Execute(*code, 995 *input, 996 0, 997 start_adr, 998 start_adr + input->length() * 2, 999 output); 1000 1001 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1002 CHECK_EQ(0, output[0]); 1003 CHECK_EQ(2, output[1]); 1004 CHECK_EQ(6, output[2]); 1005 CHECK_EQ(-1, output[3]); 1006} 1007 1008 1009 1010TEST(MacroAssemblernativeAtStart) { 1011 
v8::V8::Initialize(); 1012 ContextInitializer initializer; 1013 Factory* factory = Isolate::Current()->factory(); 1014 1015 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0); 1016 1017 Label not_at_start, newline, fail; 1018 m.CheckNotAtStart(¬_at_start); 1019 // Check that prevchar = '\n' and current = 'f'. 1020 m.CheckCharacter('\n', &newline); 1021 m.Bind(&fail); 1022 m.Fail(); 1023 m.Bind(&newline); 1024 m.LoadCurrentCharacter(0, &fail); 1025 m.CheckNotCharacter('f', &fail); 1026 m.Succeed(); 1027 1028 m.Bind(¬_at_start); 1029 // Check that prevchar = 'o' and current = 'b'. 1030 Label prevo; 1031 m.CheckCharacter('o', &prevo); 1032 m.Fail(); 1033 m.Bind(&prevo); 1034 m.LoadCurrentCharacter(0, &fail); 1035 m.CheckNotCharacter('b', &fail); 1036 m.Succeed(); 1037 1038 Handle<String> source = factory->NewStringFromAscii(CStrVector("(^f|ob)")); 1039 Handle<Object> code_object = m.GetCode(source); 1040 Handle<Code> code = Handle<Code>::cast(code_object); 1041 1042 Handle<String> input = factory->NewStringFromAscii(CStrVector("foobar")); 1043 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1044 Address start_adr = seq_input->GetCharsAddress(); 1045 1046 NativeRegExpMacroAssembler::Result result = 1047 Execute(*code, 1048 *input, 1049 0, 1050 start_adr, 1051 start_adr + input->length(), 1052 NULL); 1053 1054 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1055 1056 result = Execute(*code, 1057 *input, 1058 3, 1059 start_adr + 3, 1060 start_adr + input->length(), 1061 NULL); 1062 1063 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1064} 1065 1066 1067TEST(MacroAssemblerNativeBackRefNoCase) { 1068 v8::V8::Initialize(); 1069 ContextInitializer initializer; 1070 Factory* factory = Isolate::Current()->factory(); 1071 1072 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 1073 1074 Label fail, succ; 1075 1076 m.WriteCurrentPositionToRegister(0, 0); 1077 m.WriteCurrentPositionToRegister(2, 0); 1078 
m.AdvanceCurrentPosition(3); 1079 m.WriteCurrentPositionToRegister(3, 0); 1080 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "AbC". 1081 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "ABC". 1082 Label expected_fail; 1083 m.CheckNotBackReferenceIgnoreCase(2, &expected_fail); 1084 m.Bind(&fail); 1085 m.Fail(); 1086 1087 m.Bind(&expected_fail); 1088 m.AdvanceCurrentPosition(3); // Skip "xYz" 1089 m.CheckNotBackReferenceIgnoreCase(2, &succ); 1090 m.Fail(); 1091 1092 m.Bind(&succ); 1093 m.WriteCurrentPositionToRegister(1, 0); 1094 m.Succeed(); 1095 1096 Handle<String> source = 1097 factory->NewStringFromAscii(CStrVector("^(abc)\1\1(?!\1)...(?!\1)")); 1098 Handle<Object> code_object = m.GetCode(source); 1099 Handle<Code> code = Handle<Code>::cast(code_object); 1100 1101 Handle<String> input = 1102 factory->NewStringFromAscii(CStrVector("aBcAbCABCxYzab")); 1103 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1104 Address start_adr = seq_input->GetCharsAddress(); 1105 1106 int output[4]; 1107 NativeRegExpMacroAssembler::Result result = 1108 Execute(*code, 1109 *input, 1110 0, 1111 start_adr, 1112 start_adr + input->length(), 1113 output); 1114 1115 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1116 CHECK_EQ(0, output[0]); 1117 CHECK_EQ(12, output[1]); 1118 CHECK_EQ(0, output[2]); 1119 CHECK_EQ(3, output[3]); 1120} 1121 1122 1123 1124TEST(MacroAssemblerNativeRegisters) { 1125 v8::V8::Initialize(); 1126 ContextInitializer initializer; 1127 Factory* factory = Isolate::Current()->factory(); 1128 1129 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 6); 1130 1131 uc16 foo_chars[3] = {'f', 'o', 'o'}; 1132 Vector<const uc16> foo(foo_chars, 3); 1133 1134 enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt }; 1135 Label fail; 1136 Label backtrack; 1137 m.WriteCurrentPositionToRegister(out1, 0); // Output: [0] 1138 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck); 1139 
m.PushBacktrack(&backtrack); 1140 m.WriteStackPointerToRegister(sp); 1141 // Fill stack and registers 1142 m.AdvanceCurrentPosition(2); 1143 m.WriteCurrentPositionToRegister(out1, 0); 1144 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck); 1145 m.PushBacktrack(&fail); 1146 // Drop backtrack stack frames. 1147 m.ReadStackPointerFromRegister(sp); 1148 // And take the first backtrack (to &backtrack) 1149 m.Backtrack(); 1150 1151 m.PushCurrentPosition(); 1152 m.AdvanceCurrentPosition(2); 1153 m.PopCurrentPosition(); 1154 1155 m.Bind(&backtrack); 1156 m.PopRegister(out1); 1157 m.ReadCurrentPositionFromRegister(out1); 1158 m.AdvanceCurrentPosition(3); 1159 m.WriteCurrentPositionToRegister(out2, 0); // [0,3] 1160 1161 Label loop; 1162 m.SetRegister(loop_cnt, 0); // loop counter 1163 m.Bind(&loop); 1164 m.AdvanceRegister(loop_cnt, 1); 1165 m.AdvanceCurrentPosition(1); 1166 m.IfRegisterLT(loop_cnt, 3, &loop); 1167 m.WriteCurrentPositionToRegister(out3, 0); // [0,3,6] 1168 1169 Label loop2; 1170 m.SetRegister(loop_cnt, 2); // loop counter 1171 m.Bind(&loop2); 1172 m.AdvanceRegister(loop_cnt, -1); 1173 m.AdvanceCurrentPosition(1); 1174 m.IfRegisterGE(loop_cnt, 0, &loop2); 1175 m.WriteCurrentPositionToRegister(out4, 0); // [0,3,6,9] 1176 1177 Label loop3; 1178 Label exit_loop3; 1179 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck); 1180 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck); 1181 m.ReadCurrentPositionFromRegister(out3); 1182 m.Bind(&loop3); 1183 m.AdvanceCurrentPosition(1); 1184 m.CheckGreedyLoop(&exit_loop3); 1185 m.GoTo(&loop3); 1186 m.Bind(&exit_loop3); 1187 m.PopCurrentPosition(); 1188 m.WriteCurrentPositionToRegister(out5, 0); // [0,3,6,9,9,-1] 1189 1190 m.Succeed(); 1191 1192 m.Bind(&fail); 1193 m.Fail(); 1194 1195 Handle<String> source = 1196 factory->NewStringFromAscii(CStrVector("<loop test>")); 1197 Handle<Object> code_object = m.GetCode(source); 1198 Handle<Code> code = Handle<Code>::cast(code_object); 1199 1200 // 
String long enough for test (content doesn't matter). 1201 Handle<String> input = 1202 factory->NewStringFromAscii(CStrVector("foofoofoofoofoo")); 1203 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1204 Address start_adr = seq_input->GetCharsAddress(); 1205 1206 int output[6]; 1207 NativeRegExpMacroAssembler::Result result = 1208 Execute(*code, 1209 *input, 1210 0, 1211 start_adr, 1212 start_adr + input->length(), 1213 output); 1214 1215 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1216 CHECK_EQ(0, output[0]); 1217 CHECK_EQ(3, output[1]); 1218 CHECK_EQ(6, output[2]); 1219 CHECK_EQ(9, output[3]); 1220 CHECK_EQ(9, output[4]); 1221 CHECK_EQ(-1, output[5]); 1222} 1223 1224 1225TEST(MacroAssemblerStackOverflow) { 1226 v8::V8::Initialize(); 1227 ContextInitializer initializer; 1228 Isolate* isolate = Isolate::Current(); 1229 Factory* factory = isolate->factory(); 1230 1231 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0); 1232 1233 Label loop; 1234 m.Bind(&loop); 1235 m.PushBacktrack(&loop); 1236 m.GoTo(&loop); 1237 1238 Handle<String> source = 1239 factory->NewStringFromAscii(CStrVector("<stack overflow test>")); 1240 Handle<Object> code_object = m.GetCode(source); 1241 Handle<Code> code = Handle<Code>::cast(code_object); 1242 1243 // String long enough for test (content doesn't matter). 
1244 Handle<String> input = 1245 factory->NewStringFromAscii(CStrVector("dummy")); 1246 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1247 Address start_adr = seq_input->GetCharsAddress(); 1248 1249 NativeRegExpMacroAssembler::Result result = 1250 Execute(*code, 1251 *input, 1252 0, 1253 start_adr, 1254 start_adr + input->length(), 1255 NULL); 1256 1257 CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result); 1258 CHECK(isolate->has_pending_exception()); 1259 isolate->clear_pending_exception(); 1260} 1261 1262 1263TEST(MacroAssemblerNativeLotsOfRegisters) { 1264 v8::V8::Initialize(); 1265 ContextInitializer initializer; 1266 Isolate* isolate = Isolate::Current(); 1267 Factory* factory = isolate->factory(); 1268 1269 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 2); 1270 1271 // At least 2048, to ensure the allocated space for registers 1272 // span one full page. 1273 const int large_number = 8000; 1274 m.WriteCurrentPositionToRegister(large_number, 42); 1275 m.WriteCurrentPositionToRegister(0, 0); 1276 m.WriteCurrentPositionToRegister(1, 1); 1277 Label done; 1278 m.CheckNotBackReference(0, &done); // Performs a system-stack push. 1279 m.Bind(&done); 1280 m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck); 1281 m.PopRegister(1); 1282 m.Succeed(); 1283 1284 Handle<String> source = 1285 factory->NewStringFromAscii(CStrVector("<huge register space test>")); 1286 Handle<Object> code_object = m.GetCode(source); 1287 Handle<Code> code = Handle<Code>::cast(code_object); 1288 1289 // String long enough for test (content doesn't matter). 
1290 Handle<String> input = 1291 factory->NewStringFromAscii(CStrVector("sample text")); 1292 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1293 Address start_adr = seq_input->GetCharsAddress(); 1294 1295 int captures[2]; 1296 NativeRegExpMacroAssembler::Result result = 1297 Execute(*code, 1298 *input, 1299 0, 1300 start_adr, 1301 start_adr + input->length(), 1302 captures); 1303 1304 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1305 CHECK_EQ(0, captures[0]); 1306 CHECK_EQ(42, captures[1]); 1307 1308 isolate->clear_pending_exception(); 1309} 1310 1311#else // V8_INTERPRETED_REGEXP 1312 1313TEST(MacroAssembler) { 1314 V8::Initialize(NULL); 1315 byte codes[1024]; 1316 RegExpMacroAssemblerIrregexp m(Vector<byte>(codes, 1024)); 1317 // ^f(o)o. 1318 Label fail, fail2, start; 1319 uc16 foo_chars[3]; 1320 foo_chars[0] = 'f'; 1321 foo_chars[1] = 'o'; 1322 foo_chars[2] = 'o'; 1323 Vector<const uc16> foo(foo_chars, 3); 1324 m.SetRegister(4, 42); 1325 m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck); 1326 m.AdvanceRegister(4, 42); 1327 m.GoTo(&start); 1328 m.Fail(); 1329 m.Bind(&start); 1330 m.PushBacktrack(&fail2); 1331 m.CheckCharacters(foo, 0, &fail, true); 1332 m.WriteCurrentPositionToRegister(0, 0); 1333 m.PushCurrentPosition(); 1334 m.AdvanceCurrentPosition(3); 1335 m.WriteCurrentPositionToRegister(1, 0); 1336 m.PopCurrentPosition(); 1337 m.AdvanceCurrentPosition(1); 1338 m.WriteCurrentPositionToRegister(2, 0); 1339 m.AdvanceCurrentPosition(1); 1340 m.WriteCurrentPositionToRegister(3, 0); 1341 m.Succeed(); 1342 1343 m.Bind(&fail); 1344 m.Backtrack(); 1345 m.Succeed(); 1346 1347 m.Bind(&fail2); 1348 m.PopRegister(0); 1349 m.Fail(); 1350 1351 Isolate* isolate = Isolate::Current(); 1352 Factory* factory = isolate->factory(); 1353 HandleScope scope(isolate); 1354 1355 Handle<String> source = factory->NewStringFromAscii(CStrVector("^f(o)o")); 1356 Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source)); 1357 int 
captures[5]; 1358 1359 const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'}; 1360 Handle<String> f1_16 = 1361 factory->NewStringFromTwoByte(Vector<const uc16>(str1, 6)); 1362 1363 CHECK(IrregexpInterpreter::Match(isolate, array, f1_16, captures, 0)); 1364 CHECK_EQ(0, captures[0]); 1365 CHECK_EQ(3, captures[1]); 1366 CHECK_EQ(1, captures[2]); 1367 CHECK_EQ(2, captures[3]); 1368 CHECK_EQ(84, captures[4]); 1369 1370 const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'}; 1371 Handle<String> f2_16 = 1372 factory->NewStringFromTwoByte(Vector<const uc16>(str2, 6)); 1373 1374 CHECK(!IrregexpInterpreter::Match(isolate, array, f2_16, captures, 0)); 1375 CHECK_EQ(42, captures[0]); 1376} 1377 1378#endif // V8_INTERPRETED_REGEXP 1379 1380 1381TEST(AddInverseToTable) { 1382 v8::internal::V8::Initialize(NULL); 1383 static const int kLimit = 1000; 1384 static const int kRangeCount = 16; 1385 for (int t = 0; t < 10; t++) { 1386 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1387 ZoneList<CharacterRange>* ranges = 1388 new ZoneList<CharacterRange>(kRangeCount); 1389 for (int i = 0; i < kRangeCount; i++) { 1390 int from = PseudoRandom(t + 87, i + 25) % kLimit; 1391 int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20)); 1392 if (to > kLimit) to = kLimit; 1393 ranges->Add(CharacterRange(from, to)); 1394 } 1395 DispatchTable table; 1396 DispatchTableConstructor cons(&table, false); 1397 cons.set_choice_index(0); 1398 cons.AddInverse(ranges); 1399 for (int i = 0; i < kLimit; i++) { 1400 bool is_on = false; 1401 for (int j = 0; !is_on && j < kRangeCount; j++) 1402 is_on = ranges->at(j).Contains(i); 1403 OutSet* set = table.Get(i); 1404 CHECK_EQ(is_on, set->Get(0) == false); 1405 } 1406 } 1407 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1408 ZoneList<CharacterRange>* ranges = 1409 new ZoneList<CharacterRange>(1); 1410 ranges->Add(CharacterRange(0xFFF0, 0xFFFE)); 1411 DispatchTable table; 1412 DispatchTableConstructor cons(&table, false); 1413 
cons.set_choice_index(0); 1414 cons.AddInverse(ranges); 1415 CHECK(!table.Get(0xFFFE)->Get(0)); 1416 CHECK(table.Get(0xFFFF)->Get(0)); 1417} 1418 1419 1420static uc32 canonicalize(uc32 c) { 1421 unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth]; 1422 int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL); 1423 if (count == 0) { 1424 return c; 1425 } else { 1426 CHECK_EQ(1, count); 1427 return canon[0]; 1428 } 1429} 1430 1431 1432TEST(LatinCanonicalize) { 1433 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1434 for (char lower = 'a'; lower <= 'z'; lower++) { 1435 char upper = lower + ('A' - 'a'); 1436 CHECK_EQ(canonicalize(lower), canonicalize(upper)); 1437 unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1438 int length = un_canonicalize.get(lower, '\0', uncanon); 1439 CHECK_EQ(2, length); 1440 CHECK_EQ(upper, uncanon[0]); 1441 CHECK_EQ(lower, uncanon[1]); 1442 } 1443 for (uc32 c = 128; c < (1 << 21); c++) 1444 CHECK_GE(canonicalize(c), 128); 1445 unibrow::Mapping<unibrow::ToUppercase> to_upper; 1446 // Canonicalization is only defined for the Basic Multilingual Plane. 
1447 for (uc32 c = 0; c < (1 << 16); c++) { 1448 unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth]; 1449 int length = to_upper.get(c, '\0', upper); 1450 if (length == 0) { 1451 length = 1; 1452 upper[0] = c; 1453 } 1454 uc32 u = upper[0]; 1455 if (length > 1 || (c >= 128 && u < 128)) 1456 u = c; 1457 CHECK_EQ(u, canonicalize(c)); 1458 } 1459} 1460 1461 1462static uc32 CanonRangeEnd(uc32 c) { 1463 unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth]; 1464 int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL); 1465 if (count == 0) { 1466 return c; 1467 } else { 1468 CHECK_EQ(1, count); 1469 return canon[0]; 1470 } 1471} 1472 1473 1474TEST(RangeCanonicalization) { 1475 // Check that we arrive at the same result when using the basic 1476 // range canonicalization primitives as when using immediate 1477 // canonicalization. 1478 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1479 int block_start = 0; 1480 while (block_start <= 0xFFFF) { 1481 uc32 block_end = CanonRangeEnd(block_start); 1482 unsigned block_length = block_end - block_start + 1; 1483 if (block_length > 1) { 1484 unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1485 int first_length = un_canonicalize.get(block_start, '\0', first); 1486 for (unsigned i = 1; i < block_length; i++) { 1487 unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1488 int succ_length = un_canonicalize.get(block_start + i, '\0', succ); 1489 CHECK_EQ(first_length, succ_length); 1490 for (int j = 0; j < succ_length; j++) { 1491 int calc = first[j] + i; 1492 int found = succ[j]; 1493 CHECK_EQ(calc, found); 1494 } 1495 } 1496 } 1497 block_start = block_start + block_length; 1498 } 1499} 1500 1501 1502TEST(UncanonicalizeEquivalence) { 1503 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1504 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1505 for (int i = 0; i < (1 << 16); i++) { 1506 int length = un_canonicalize.get(i, 
'\0', chars); 1507 for (int j = 0; j < length; j++) { 1508 unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1509 int length2 = un_canonicalize.get(chars[j], '\0', chars2); 1510 CHECK_EQ(length, length2); 1511 for (int k = 0; k < length; k++) 1512 CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k])); 1513 } 1514 } 1515} 1516 1517 1518static void TestRangeCaseIndependence(CharacterRange input, 1519 Vector<CharacterRange> expected) { 1520 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1521 int count = expected.length(); 1522 ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(count); 1523 input.AddCaseEquivalents(list, false); 1524 CHECK_EQ(count, list->length()); 1525 for (int i = 0; i < list->length(); i++) { 1526 CHECK_EQ(expected[i].from(), list->at(i).from()); 1527 CHECK_EQ(expected[i].to(), list->at(i).to()); 1528 } 1529} 1530 1531 1532static void TestSimpleRangeCaseIndependence(CharacterRange input, 1533 CharacterRange expected) { 1534 EmbeddedVector<CharacterRange, 1> vector; 1535 vector[0] = expected; 1536 TestRangeCaseIndependence(input, vector); 1537} 1538 1539 1540TEST(CharacterRangeCaseIndependence) { 1541 v8::internal::V8::Initialize(NULL); 1542 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'), 1543 CharacterRange::Singleton('A')); 1544 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'), 1545 CharacterRange::Singleton('Z')); 1546 TestSimpleRangeCaseIndependence(CharacterRange('a', 'z'), 1547 CharacterRange('A', 'Z')); 1548 TestSimpleRangeCaseIndependence(CharacterRange('c', 'f'), 1549 CharacterRange('C', 'F')); 1550 TestSimpleRangeCaseIndependence(CharacterRange('a', 'b'), 1551 CharacterRange('A', 'B')); 1552 TestSimpleRangeCaseIndependence(CharacterRange('y', 'z'), 1553 CharacterRange('Y', 'Z')); 1554 TestSimpleRangeCaseIndependence(CharacterRange('a' - 1, 'z' + 1), 1555 CharacterRange('A', 'Z')); 1556 TestSimpleRangeCaseIndependence(CharacterRange('A', 'Z'), 1557 
CharacterRange('a', 'z')); 1558 TestSimpleRangeCaseIndependence(CharacterRange('C', 'F'), 1559 CharacterRange('c', 'f')); 1560 TestSimpleRangeCaseIndependence(CharacterRange('A' - 1, 'Z' + 1), 1561 CharacterRange('a', 'z')); 1562 // Here we need to add [l-z] to complete the case independence of 1563 // [A-Za-z] but we expect [a-z] to be added since we always add a 1564 // whole block at a time. 1565 TestSimpleRangeCaseIndependence(CharacterRange('A', 'k'), 1566 CharacterRange('a', 'z')); 1567} 1568 1569 1570static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) { 1571 if (ranges == NULL) 1572 return false; 1573 for (int i = 0; i < ranges->length(); i++) { 1574 CharacterRange range = ranges->at(i); 1575 if (range.from() <= c && c <= range.to()) 1576 return true; 1577 } 1578 return false; 1579} 1580 1581 1582TEST(CharClassDifference) { 1583 v8::internal::V8::Initialize(NULL); 1584 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1585 ZoneList<CharacterRange>* base = new ZoneList<CharacterRange>(1); 1586 base->Add(CharacterRange::Everything()); 1587 Vector<const uc16> overlay = CharacterRange::GetWordBounds(); 1588 ZoneList<CharacterRange>* included = NULL; 1589 ZoneList<CharacterRange>* excluded = NULL; 1590 CharacterRange::Split(base, overlay, &included, &excluded); 1591 for (int i = 0; i < (1 << 16); i++) { 1592 bool in_base = InClass(i, base); 1593 if (in_base) { 1594 bool in_overlay = false; 1595 for (int j = 0; !in_overlay && j < overlay.length(); j += 2) { 1596 if (overlay[j] <= i && i <= overlay[j+1]) 1597 in_overlay = true; 1598 } 1599 CHECK_EQ(in_overlay, InClass(i, included)); 1600 CHECK_EQ(!in_overlay, InClass(i, excluded)); 1601 } else { 1602 CHECK(!InClass(i, included)); 1603 CHECK(!InClass(i, excluded)); 1604 } 1605 } 1606} 1607 1608 1609TEST(CanonicalizeCharacterSets) { 1610 v8::internal::V8::Initialize(NULL); 1611 ZoneScope scope(Isolate::Current(), DELETE_ON_EXIT); 1612 ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(4); 
1613 CharacterSet set(list); 1614 1615 list->Add(CharacterRange(10, 20)); 1616 list->Add(CharacterRange(30, 40)); 1617 list->Add(CharacterRange(50, 60)); 1618 set.Canonicalize(); 1619 ASSERT_EQ(3, list->length()); 1620 ASSERT_EQ(10, list->at(0).from()); 1621 ASSERT_EQ(20, list->at(0).to()); 1622 ASSERT_EQ(30, list->at(1).from()); 1623 ASSERT_EQ(40, list->at(1).to()); 1624 ASSERT_EQ(50, list->at(2).from()); 1625 ASSERT_EQ(60, list->at(2).to()); 1626 1627 list->Rewind(0); 1628 list->Add(CharacterRange(10, 20)); 1629 list->Add(CharacterRange(50, 60)); 1630 list->Add(CharacterRange(30, 40)); 1631 set.Canonicalize(); 1632 ASSERT_EQ(3, list->length()); 1633 ASSERT_EQ(10, list->at(0).from()); 1634 ASSERT_EQ(20, list->at(0).to()); 1635 ASSERT_EQ(30, list->at(1).from()); 1636 ASSERT_EQ(40, list->at(1).to()); 1637 ASSERT_EQ(50, list->at(2).from()); 1638 ASSERT_EQ(60, list->at(2).to()); 1639 1640 list->Rewind(0); 1641 list->Add(CharacterRange(30, 40)); 1642 list->Add(CharacterRange(10, 20)); 1643 list->Add(CharacterRange(25, 25)); 1644 list->Add(CharacterRange(100, 100)); 1645 list->Add(CharacterRange(1, 1)); 1646 set.Canonicalize(); 1647 ASSERT_EQ(5, list->length()); 1648 ASSERT_EQ(1, list->at(0).from()); 1649 ASSERT_EQ(1, list->at(0).to()); 1650 ASSERT_EQ(10, list->at(1).from()); 1651 ASSERT_EQ(20, list->at(1).to()); 1652 ASSERT_EQ(25, list->at(2).from()); 1653 ASSERT_EQ(25, list->at(2).to()); 1654 ASSERT_EQ(30, list->at(3).from()); 1655 ASSERT_EQ(40, list->at(3).to()); 1656 ASSERT_EQ(100, list->at(4).from()); 1657 ASSERT_EQ(100, list->at(4).to()); 1658 1659 list->Rewind(0); 1660 list->Add(CharacterRange(10, 19)); 1661 list->Add(CharacterRange(21, 30)); 1662 list->Add(CharacterRange(20, 20)); 1663 set.Canonicalize(); 1664 ASSERT_EQ(1, list->length()); 1665 ASSERT_EQ(10, list->at(0).from()); 1666 ASSERT_EQ(30, list->at(0).to()); 1667} 1668 1669// Checks whether a character is in the set represented by a list of ranges. 
1670static bool CharacterInSet(ZoneList<CharacterRange>* set, uc16 value) { 1671 for (int i = 0; i < set->length(); i++) { 1672 CharacterRange range = set->at(i); 1673 if (range.from() <= value && value <= range.to()) { 1674 return true; 1675 } 1676 } 1677 return false; 1678} 1679 1680TEST(CharacterRangeMerge) { 1681 v8::internal::V8::Initialize(NULL); 1682 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1683 ZoneList<CharacterRange> l1(4); 1684 ZoneList<CharacterRange> l2(4); 1685 // Create all combinations of intersections of ranges, both singletons and 1686 // longer. 1687 1688 int offset = 0; 1689 1690 // The five kinds of singleton intersections: 1691 // X 1692 // Y - outside before 1693 // Y - outside touching start 1694 // Y - overlap 1695 // Y - outside touching end 1696 // Y - outside after 1697 1698 for (int i = 0; i < 5; i++) { 1699 l1.Add(CharacterRange::Singleton(offset + 2)); 1700 l2.Add(CharacterRange::Singleton(offset + i)); 1701 offset += 6; 1702 } 1703 1704 // The seven kinds of singleton/non-singleton intersections: 1705 // XXX 1706 // Y - outside before 1707 // Y - outside touching start 1708 // Y - inside touching start 1709 // Y - entirely inside 1710 // Y - inside touching end 1711 // Y - outside touching end 1712 // Y - disjoint after 1713 1714 for (int i = 0; i < 7; i++) { 1715 l1.Add(CharacterRange::Range(offset + 2, offset + 4)); 1716 l2.Add(CharacterRange::Singleton(offset + i)); 1717 offset += 8; 1718 } 1719 1720 // The eleven kinds of non-singleton intersections: 1721 // 1722 // XXXXXXXX 1723 // YYYY - outside before. 1724 // YYYY - outside touching start. 1725 // YYYY - overlapping start 1726 // YYYY - inside touching start 1727 // YYYY - entirely inside 1728 // YYYY - inside touching end 1729 // YYYY - overlapping end 1730 // YYYY - outside touching end 1731 // YYYY - outside after 1732 // YYYYYYYY - identical 1733 // YYYYYYYYYYYY - containing entirely. 
1734 1735 for (int i = 0; i < 9; i++) { 1736 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); // Length 8. 1737 l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3)); 1738 offset += 22; 1739 } 1740 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); 1741 l2.Add(CharacterRange::Range(offset + 6, offset + 15)); 1742 offset += 22; 1743 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); 1744 l2.Add(CharacterRange::Range(offset + 4, offset + 17)); 1745 offset += 22; 1746 1747 // Different kinds of multi-range overlap: 1748 // XXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXX 1749 // YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y 1750 1751 l1.Add(CharacterRange::Range(offset, offset + 21)); 1752 l1.Add(CharacterRange::Range(offset + 31, offset + 52)); 1753 for (int i = 0; i < 6; i++) { 1754 l2.Add(CharacterRange::Range(offset + 2, offset + 5)); 1755 l2.Add(CharacterRange::Singleton(offset + 8)); 1756 offset += 9; 1757 } 1758 1759 ASSERT(CharacterRange::IsCanonical(&l1)); 1760 ASSERT(CharacterRange::IsCanonical(&l2)); 1761 1762 ZoneList<CharacterRange> first_only(4); 1763 ZoneList<CharacterRange> second_only(4); 1764 ZoneList<CharacterRange> both(4); 1765 1766 // Merge one direction. 1767 CharacterRange::Merge(&l1, &l2, &first_only, &second_only, &both); 1768 1769 CHECK(CharacterRange::IsCanonical(&first_only)); 1770 CHECK(CharacterRange::IsCanonical(&second_only)); 1771 CHECK(CharacterRange::IsCanonical(&both)); 1772 1773 for (uc16 i = 0; i < offset; i++) { 1774 bool in_first = CharacterInSet(&l1, i); 1775 bool in_second = CharacterInSet(&l2, i); 1776 CHECK((in_first && !in_second) == CharacterInSet(&first_only, i)); 1777 CHECK((!in_first && in_second) == CharacterInSet(&second_only, i)); 1778 CHECK((in_first && in_second) == CharacterInSet(&both, i)); 1779 } 1780 1781 first_only.Clear(); 1782 second_only.Clear(); 1783 both.Clear(); 1784 1785 // Merge other direction. 
1786 CharacterRange::Merge(&l2, &l1, &second_only, &first_only, &both); 1787 1788 CHECK(CharacterRange::IsCanonical(&first_only)); 1789 CHECK(CharacterRange::IsCanonical(&second_only)); 1790 CHECK(CharacterRange::IsCanonical(&both)); 1791 1792 for (uc16 i = 0; i < offset; i++) { 1793 bool in_first = CharacterInSet(&l1, i); 1794 bool in_second = CharacterInSet(&l2, i); 1795 CHECK((in_first && !in_second) == CharacterInSet(&first_only, i)); 1796 CHECK((!in_first && in_second) == CharacterInSet(&second_only, i)); 1797 CHECK((in_first && in_second) == CharacterInSet(&both, i)); 1798 } 1799 1800 first_only.Clear(); 1801 second_only.Clear(); 1802 both.Clear(); 1803 1804 // Merge but don't record all combinations. 1805 CharacterRange::Merge(&l1, &l2, NULL, NULL, &both); 1806 1807 CHECK(CharacterRange::IsCanonical(&both)); 1808 1809 for (uc16 i = 0; i < offset; i++) { 1810 bool in_first = CharacterInSet(&l1, i); 1811 bool in_second = CharacterInSet(&l2, i); 1812 CHECK((in_first && in_second) == CharacterInSet(&both, i)); 1813 } 1814 1815 // Merge into same set. 1816 ZoneList<CharacterRange> all(4); 1817 CharacterRange::Merge(&l1, &l2, &all, &all, &all); 1818 1819 CHECK(CharacterRange::IsCanonical(&all)); 1820 1821 for (uc16 i = 0; i < offset; i++) { 1822 bool in_first = CharacterInSet(&l1, i); 1823 bool in_second = CharacterInSet(&l2, i); 1824 CHECK((in_first || in_second) == CharacterInSet(&all, i)); 1825 } 1826} 1827 1828 1829TEST(Graph) { 1830 V8::Initialize(NULL); 1831 Execute("\\b\\w+\\b", false, true, true); 1832} 1833