// Copyright 2012 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28 29#include <stdlib.h> 30 31#include "v8.h" 32 33#include "ast.h" 34#include "char-predicates-inl.h" 35#include "cctest.h" 36#include "jsregexp.h" 37#include "parser.h" 38#include "regexp-macro-assembler.h" 39#include "regexp-macro-assembler-irregexp.h" 40#include "string-stream.h" 41#include "zone-inl.h" 42#ifdef V8_INTERPRETED_REGEXP 43#include "interpreter-irregexp.h" 44#else // V8_INTERPRETED_REGEXP 45#include "macro-assembler.h" 46#include "code.h" 47#if V8_TARGET_ARCH_ARM 48#include "arm/assembler-arm.h" 49#include "arm/macro-assembler-arm.h" 50#include "arm/regexp-macro-assembler-arm.h" 51#endif 52#if V8_TARGET_ARCH_MIPS 53#include "mips/assembler-mips.h" 54#include "mips/macro-assembler-mips.h" 55#include "mips/regexp-macro-assembler-mips.h" 56#endif 57#if V8_TARGET_ARCH_X64 58#include "x64/assembler-x64.h" 59#include "x64/macro-assembler-x64.h" 60#include "x64/regexp-macro-assembler-x64.h" 61#endif 62#if V8_TARGET_ARCH_IA32 63#include "ia32/assembler-ia32.h" 64#include "ia32/macro-assembler-ia32.h" 65#include "ia32/regexp-macro-assembler-ia32.h" 66#endif 67#endif // V8_INTERPRETED_REGEXP 68 69using namespace v8::internal; 70 71 72static bool CheckParse(const char* input) { 73 V8::Initialize(NULL); 74 v8::HandleScope scope(v8::Isolate::GetCurrent()); 75 Zone zone(Isolate::Current()); 76 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 77 RegExpCompileData result; 78 return v8::internal::RegExpParser::ParseRegExp( 79 &reader, false, &result, &zone); 80} 81 82 83static SmartArrayPointer<const char> Parse(const char* input) { 84 V8::Initialize(NULL); 85 v8::HandleScope scope(v8::Isolate::GetCurrent()); 86 Zone zone(Isolate::Current()); 87 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 88 RegExpCompileData result; 89 CHECK(v8::internal::RegExpParser::ParseRegExp( 90 &reader, false, &result, &zone)); 91 CHECK(result.tree != NULL); 92 CHECK(result.error.is_null()); 93 SmartArrayPointer<const char> output = 
result.tree->ToString(&zone); 94 return output; 95} 96 97 98static bool CheckSimple(const char* input) { 99 V8::Initialize(NULL); 100 v8::HandleScope scope(v8::Isolate::GetCurrent()); 101 Zone zone(Isolate::Current()); 102 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 103 RegExpCompileData result; 104 CHECK(v8::internal::RegExpParser::ParseRegExp( 105 &reader, false, &result, &zone)); 106 CHECK(result.tree != NULL); 107 CHECK(result.error.is_null()); 108 return result.simple; 109} 110 111struct MinMaxPair { 112 int min_match; 113 int max_match; 114}; 115 116 117static MinMaxPair CheckMinMaxMatch(const char* input) { 118 V8::Initialize(NULL); 119 v8::HandleScope scope(v8::Isolate::GetCurrent()); 120 Zone zone(Isolate::Current()); 121 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 122 RegExpCompileData result; 123 CHECK(v8::internal::RegExpParser::ParseRegExp( 124 &reader, false, &result, &zone)); 125 CHECK(result.tree != NULL); 126 CHECK(result.error.is_null()); 127 int min_match = result.tree->min_match(); 128 int max_match = result.tree->max_match(); 129 MinMaxPair pair = { min_match, max_match }; 130 return pair; 131} 132 133 134#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input)) 135#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input)) 136#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input)); 137#define CHECK_MIN_MAX(input, min, max) \ 138 { MinMaxPair min_max = CheckMinMaxMatch(input); \ 139 CHECK_EQ(min, min_max.min_match); \ 140 CHECK_EQ(max, min_max.max_match); \ 141 } 142 143TEST(Parser) { 144 V8::Initialize(NULL); 145 146 CHECK_PARSE_ERROR("?"); 147 148 CHECK_PARSE_EQ("abc", "'abc'"); 149 CHECK_PARSE_EQ("", "%"); 150 CHECK_PARSE_EQ("abc|def", "(| 'abc' 'def')"); 151 CHECK_PARSE_EQ("abc|def|ghi", "(| 'abc' 'def' 'ghi')"); 152 CHECK_PARSE_EQ("^xxx$", "(: @^i 'xxx' @$i)"); 153 CHECK_PARSE_EQ("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')"); 154 CHECK_PARSE_EQ("\\w|\\d", "(| [0-9 A-Z 
_ a-z] [0-9])"); 155 CHECK_PARSE_EQ("a*", "(# 0 - g 'a')"); 156 CHECK_PARSE_EQ("a*?", "(# 0 - n 'a')"); 157 CHECK_PARSE_EQ("abc+", "(: 'ab' (# 1 - g 'c'))"); 158 CHECK_PARSE_EQ("abc+?", "(: 'ab' (# 1 - n 'c'))"); 159 CHECK_PARSE_EQ("xyz?", "(: 'xy' (# 0 1 g 'z'))"); 160 CHECK_PARSE_EQ("xyz??", "(: 'xy' (# 0 1 n 'z'))"); 161 CHECK_PARSE_EQ("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))"); 162 CHECK_PARSE_EQ("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))"); 163 CHECK_PARSE_EQ("xyz{93}", "(: 'xy' (# 93 93 g 'z'))"); 164 CHECK_PARSE_EQ("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))"); 165 CHECK_PARSE_EQ("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))"); 166 CHECK_PARSE_EQ("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))"); 167 CHECK_PARSE_EQ("xyz{1,}", "(: 'xy' (# 1 - g 'z'))"); 168 CHECK_PARSE_EQ("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))"); 169 CHECK_PARSE_EQ("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'"); 170 CHECK_PARSE_EQ("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')"); 171 CHECK_PARSE_EQ("(?:foo)", "'foo'"); 172 CHECK_PARSE_EQ("(?: foo )", "' foo '"); 173 CHECK_PARSE_EQ("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))"); 174 CHECK_PARSE_EQ("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')"); 175 CHECK_PARSE_EQ("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')"); 176 CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')"); 177 CHECK_PARSE_EQ("()", "(^ %)"); 178 CHECK_PARSE_EQ("(?=)", "(-> + %)"); 179 CHECK_PARSE_EQ("[]", "^[\\x00-\\uffff]"); // Doesn't compile on windows 180 CHECK_PARSE_EQ("[^]", "[\\x00-\\uffff]"); // \uffff isn't in codepage 1252 181 CHECK_PARSE_EQ("[x]", "[x]"); 182 CHECK_PARSE_EQ("[xyz]", "[x y z]"); 183 CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]"); 184 CHECK_PARSE_EQ("[-123]", "[- 1 2 3]"); 185 CHECK_PARSE_EQ("[^123]", "^[1 2 3]"); 186 CHECK_PARSE_EQ("]", "']'"); 187 CHECK_PARSE_EQ("}", "'}'"); 188 CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]"); 189 CHECK_PARSE_EQ("[\\d]", "[0-9]"); 190 CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]"); 191 CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]"); 192 
CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]"); 193 CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]"); 194 // Control character outside character class. 195 CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK", 196 "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'"); 197 CHECK_PARSE_EQ("\\c!", "'\\c!'"); 198 CHECK_PARSE_EQ("\\c_", "'\\c_'"); 199 CHECK_PARSE_EQ("\\c~", "'\\c~'"); 200 CHECK_PARSE_EQ("\\c1", "'\\c1'"); 201 // Control character inside character class. 202 CHECK_PARSE_EQ("[\\c!]", "[\\ c !]"); 203 CHECK_PARSE_EQ("[\\c_]", "[\\x1f]"); 204 CHECK_PARSE_EQ("[\\c~]", "[\\ c ~]"); 205 CHECK_PARSE_EQ("[\\ca]", "[\\x01]"); 206 CHECK_PARSE_EQ("[\\cz]", "[\\x1a]"); 207 CHECK_PARSE_EQ("[\\cA]", "[\\x01]"); 208 CHECK_PARSE_EQ("[\\cZ]", "[\\x1a]"); 209 CHECK_PARSE_EQ("[\\c1]", "[\\x11]"); 210 211 CHECK_PARSE_EQ("[a\\]c]", "[a ] c]"); 212 CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '"); 213 CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ # ]"); 214 CHECK_PARSE_EQ("\\0", "'\\x00'"); 215 CHECK_PARSE_EQ("\\8", "'8'"); 216 CHECK_PARSE_EQ("\\9", "'9'"); 217 CHECK_PARSE_EQ("\\11", "'\\x09'"); 218 CHECK_PARSE_EQ("\\11a", "'\\x09a'"); 219 CHECK_PARSE_EQ("\\011", "'\\x09'"); 220 CHECK_PARSE_EQ("\\00011", "'\\x0011'"); 221 CHECK_PARSE_EQ("\\118", "'\\x098'"); 222 CHECK_PARSE_EQ("\\111", "'I'"); 223 CHECK_PARSE_EQ("\\1111", "'I1'"); 224 CHECK_PARSE_EQ("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))"); 225 CHECK_PARSE_EQ("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))"); 226 CHECK_PARSE_EQ("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))"); 227 CHECK_PARSE_EQ("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')"); 228 CHECK_PARSE_EQ("(x)(x)(x)\\1*", "(: (^ 'x') (^ 'x') (^ 'x')" 229 " (# 0 - g (<- 1)))"); 230 CHECK_PARSE_EQ("(x)(x)(x)\\2*", "(: (^ 'x') (^ 'x') (^ 'x')" 231 " (# 0 - g (<- 2)))"); 232 CHECK_PARSE_EQ("(x)(x)(x)\\3*", "(: (^ 'x') (^ 'x') (^ 'x')" 233 " (# 0 - g (<- 3)))"); 234 CHECK_PARSE_EQ("(x)(x)(x)\\4*", "(: (^ 'x') (^ 'x') (^ 'x')" 235 " (# 0 - g 
'\\x04'))"); 236 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10", 237 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')" 238 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))"); 239 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11", 240 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')" 241 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')"); 242 CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))"); 243 CHECK_PARSE_EQ("(a\\1)", "(^ 'a')"); 244 CHECK_PARSE_EQ("(\\1a)", "(^ 'a')"); 245 CHECK_PARSE_EQ("(?=a)?a", "'a'"); 246 CHECK_PARSE_EQ("(?=a){0,10}a", "'a'"); 247 CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')"); 248 CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')"); 249 CHECK_PARSE_EQ("(?!a)?a", "'a'"); 250 CHECK_PARSE_EQ("\\1(a)", "(^ 'a')"); 251 CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))"); 252 CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))"); 253 CHECK_PARSE_EQ("[\\0]", "[\\x00]"); 254 CHECK_PARSE_EQ("[\\11]", "[\\x09]"); 255 CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]"); 256 CHECK_PARSE_EQ("[\\011]", "[\\x09]"); 257 CHECK_PARSE_EQ("[\\00011]", "[\\x00 1 1]"); 258 CHECK_PARSE_EQ("[\\118]", "[\\x09 8]"); 259 CHECK_PARSE_EQ("[\\111]", "[I]"); 260 CHECK_PARSE_EQ("[\\1111]", "[I 1]"); 261 CHECK_PARSE_EQ("\\x34", "'\x34'"); 262 CHECK_PARSE_EQ("\\x60", "'\x60'"); 263 CHECK_PARSE_EQ("\\x3z", "'x3z'"); 264 CHECK_PARSE_EQ("\\c", "'\\c'"); 265 CHECK_PARSE_EQ("\\u0034", "'\x34'"); 266 CHECK_PARSE_EQ("\\u003z", "'u003z'"); 267 CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))"); 268 269 CHECK_SIMPLE("", false); 270 CHECK_SIMPLE("a", true); 271 CHECK_SIMPLE("a|b", false); 272 CHECK_SIMPLE("a\\n", false); 273 CHECK_SIMPLE("^a", false); 274 CHECK_SIMPLE("a$", false); 275 CHECK_SIMPLE("a\\b!", false); 276 CHECK_SIMPLE("a\\Bb", false); 277 CHECK_SIMPLE("a*", false); 278 CHECK_SIMPLE("a*?", false); 279 CHECK_SIMPLE("a?", false); 280 CHECK_SIMPLE("a??", false); 281 CHECK_SIMPLE("a{0,1}?", false); 282 CHECK_SIMPLE("a{1,1}?", false); 283 
CHECK_SIMPLE("a{1,2}?", false); 284 CHECK_SIMPLE("a+?", false); 285 CHECK_SIMPLE("(a)", false); 286 CHECK_SIMPLE("(a)\\1", false); 287 CHECK_SIMPLE("(\\1a)", false); 288 CHECK_SIMPLE("\\1(a)", false); 289 CHECK_SIMPLE("a\\s", false); 290 CHECK_SIMPLE("a\\S", false); 291 CHECK_SIMPLE("a\\d", false); 292 CHECK_SIMPLE("a\\D", false); 293 CHECK_SIMPLE("a\\w", false); 294 CHECK_SIMPLE("a\\W", false); 295 CHECK_SIMPLE("a.", false); 296 CHECK_SIMPLE("a\\q", false); 297 CHECK_SIMPLE("a[a]", false); 298 CHECK_SIMPLE("a[^a]", false); 299 CHECK_SIMPLE("a[a-z]", false); 300 CHECK_SIMPLE("a[\\q]", false); 301 CHECK_SIMPLE("a(?:b)", false); 302 CHECK_SIMPLE("a(?=b)", false); 303 CHECK_SIMPLE("a(?!b)", false); 304 CHECK_SIMPLE("\\x60", false); 305 CHECK_SIMPLE("\\u0060", false); 306 CHECK_SIMPLE("\\cA", false); 307 CHECK_SIMPLE("\\q", false); 308 CHECK_SIMPLE("\\1112", false); 309 CHECK_SIMPLE("\\0", false); 310 CHECK_SIMPLE("(a)\\1", false); 311 CHECK_SIMPLE("(?=a)?a", false); 312 CHECK_SIMPLE("(?!a)?a\\1", false); 313 CHECK_SIMPLE("(?:(?=a))a\\1", false); 314 315 CHECK_PARSE_EQ("a{}", "'a{}'"); 316 CHECK_PARSE_EQ("a{,}", "'a{,}'"); 317 CHECK_PARSE_EQ("a{", "'a{'"); 318 CHECK_PARSE_EQ("a{z}", "'a{z}'"); 319 CHECK_PARSE_EQ("a{1z}", "'a{1z}'"); 320 CHECK_PARSE_EQ("a{12z}", "'a{12z}'"); 321 CHECK_PARSE_EQ("a{12,", "'a{12,'"); 322 CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'"); 323 CHECK_PARSE_EQ("{}", "'{}'"); 324 CHECK_PARSE_EQ("{,}", "'{,}'"); 325 CHECK_PARSE_EQ("{", "'{'"); 326 CHECK_PARSE_EQ("{z}", "'{z}'"); 327 CHECK_PARSE_EQ("{1z}", "'{1z}'"); 328 CHECK_PARSE_EQ("{12z}", "'{12z}'"); 329 CHECK_PARSE_EQ("{12,", "'{12,'"); 330 CHECK_PARSE_EQ("{12,3b", "'{12,3b'"); 331 332 CHECK_MIN_MAX("a", 1, 1); 333 CHECK_MIN_MAX("abc", 3, 3); 334 CHECK_MIN_MAX("a[bc]d", 3, 3); 335 CHECK_MIN_MAX("a|bc", 1, 2); 336 CHECK_MIN_MAX("ab|c", 1, 2); 337 CHECK_MIN_MAX("a||bc", 0, 2); 338 CHECK_MIN_MAX("|", 0, 0); 339 CHECK_MIN_MAX("(?:ab)", 2, 2); 340 CHECK_MIN_MAX("(?:ab|cde)", 2, 3); 341 
CHECK_MIN_MAX("(?:ab)|cde", 2, 3); 342 CHECK_MIN_MAX("(ab)", 2, 2); 343 CHECK_MIN_MAX("(ab|cde)", 2, 3); 344 CHECK_MIN_MAX("(ab)\\1", 2, 4); 345 CHECK_MIN_MAX("(ab|cde)\\1", 2, 6); 346 CHECK_MIN_MAX("(?:ab)?", 0, 2); 347 CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity); 348 CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity); 349 CHECK_MIN_MAX("a?", 0, 1); 350 CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity); 351 CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity); 352 CHECK_MIN_MAX("a??", 0, 1); 353 CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity); 354 CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity); 355 CHECK_MIN_MAX("(?:a?)?", 0, 1); 356 CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity); 357 CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity); 358 CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity); 359 CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity); 360 CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity); 361 CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity); 362 CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity); 363 CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity); 364 CHECK_MIN_MAX("a{0}", 0, 0); 365 CHECK_MIN_MAX("(?:a+){0}", 0, 0); 366 CHECK_MIN_MAX("(?:a+){0,0}", 0, 0); 367 CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity); 368 CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity); 369 CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity); 370 CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity); 371 CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity); 372 CHECK_MIN_MAX("(?:ab){4,7}", 8, 14); 373 CHECK_MIN_MAX("a\\bc", 2, 2); 374 CHECK_MIN_MAX("a\\Bc", 2, 2); 375 CHECK_MIN_MAX("a\\sc", 3, 3); 376 CHECK_MIN_MAX("a\\Sc", 3, 3); 377 CHECK_MIN_MAX("a(?=b)c", 2, 2); 378 CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2); 379 CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2); 380} 381 382 383TEST(ParserRegression) { 384 CHECK_PARSE_EQ("[A-Z$-][x]", "(! 
[A-Z $ -] [x])"); 385 CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')"); 386 CHECK_PARSE_EQ("{", "'{'"); 387 CHECK_PARSE_EQ("a|", "(| 'a' %)"); 388} 389 390static void ExpectError(const char* input, 391 const char* expected) { 392 V8::Initialize(NULL); 393 v8::HandleScope scope(v8::Isolate::GetCurrent()); 394 Zone zone(Isolate::Current()); 395 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 396 RegExpCompileData result; 397 CHECK(!v8::internal::RegExpParser::ParseRegExp( 398 &reader, false, &result, &zone)); 399 CHECK(result.tree == NULL); 400 CHECK(!result.error.is_null()); 401 SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS); 402 CHECK_EQ(expected, *str); 403} 404 405 406TEST(Errors) { 407 V8::Initialize(NULL); 408 const char* kEndBackslash = "\\ at end of pattern"; 409 ExpectError("\\", kEndBackslash); 410 const char* kUnterminatedGroup = "Unterminated group"; 411 ExpectError("(foo", kUnterminatedGroup); 412 const char* kInvalidGroup = "Invalid group"; 413 ExpectError("(?", kInvalidGroup); 414 const char* kUnterminatedCharacterClass = "Unterminated character class"; 415 ExpectError("[", kUnterminatedCharacterClass); 416 ExpectError("[a-", kUnterminatedCharacterClass); 417 const char* kNothingToRepeat = "Nothing to repeat"; 418 ExpectError("*", kNothingToRepeat); 419 ExpectError("?", kNothingToRepeat); 420 ExpectError("+", kNothingToRepeat); 421 ExpectError("{1}", kNothingToRepeat); 422 ExpectError("{1,2}", kNothingToRepeat); 423 ExpectError("{1,}", kNothingToRepeat); 424 425 // Check that we don't allow more than kMaxCapture captures 426 const int kMaxCaptures = 1 << 16; // Must match RegExpParser::kMaxCaptures. 
427 const char* kTooManyCaptures = "Too many captures"; 428 HeapStringAllocator allocator; 429 StringStream accumulator(&allocator); 430 for (int i = 0; i <= kMaxCaptures; i++) { 431 accumulator.Add("()"); 432 } 433 SmartArrayPointer<const char> many_captures(accumulator.ToCString()); 434 ExpectError(*many_captures, kTooManyCaptures); 435} 436 437 438static bool IsDigit(uc16 c) { 439 return ('0' <= c && c <= '9'); 440} 441 442 443static bool NotDigit(uc16 c) { 444 return !IsDigit(c); 445} 446 447 448static bool IsWhiteSpace(uc16 c) { 449 switch (c) { 450 case 0x09: 451 case 0x0A: 452 case 0x0B: 453 case 0x0C: 454 case 0x0d: 455 case 0x20: 456 case 0xA0: 457 case 0x2028: 458 case 0x2029: 459 case 0xFEFF: 460 return true; 461 default: 462 return unibrow::Space::Is(c); 463 } 464} 465 466 467static bool NotWhiteSpace(uc16 c) { 468 return !IsWhiteSpace(c); 469} 470 471 472static bool NotWord(uc16 c) { 473 return !IsRegExpWord(c); 474} 475 476 477static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) { 478 Zone zone(Isolate::Current()); 479 ZoneList<CharacterRange>* ranges = 480 new(&zone) ZoneList<CharacterRange>(2, &zone); 481 CharacterRange::AddClassEscape(c, ranges, &zone); 482 for (unsigned i = 0; i < (1 << 16); i++) { 483 bool in_class = false; 484 for (int j = 0; !in_class && j < ranges->length(); j++) { 485 CharacterRange& range = ranges->at(j); 486 in_class = (range.from() <= i && i <= range.to()); 487 } 488 CHECK_EQ(pred(i), in_class); 489 } 490} 491 492 493TEST(CharacterClassEscapes) { 494 v8::internal::V8::Initialize(NULL); 495 TestCharacterClassEscapes('.', IsRegExpNewline); 496 TestCharacterClassEscapes('d', IsDigit); 497 TestCharacterClassEscapes('D', NotDigit); 498 TestCharacterClassEscapes('s', IsWhiteSpace); 499 TestCharacterClassEscapes('S', NotWhiteSpace); 500 TestCharacterClassEscapes('w', IsRegExpWord); 501 TestCharacterClassEscapes('W', NotWord); 502} 503 504 505static RegExpNode* Compile(const char* input, 506 bool multiline, 507 bool 
is_ascii, 508 Zone* zone) { 509 V8::Initialize(NULL); 510 Isolate* isolate = Isolate::Current(); 511 FlatStringReader reader(isolate, CStrVector(input)); 512 RegExpCompileData compile_data; 513 if (!v8::internal::RegExpParser::ParseRegExp(&reader, multiline, 514 &compile_data, zone)) 515 return NULL; 516 Handle<String> pattern = isolate->factory()-> 517 NewStringFromUtf8(CStrVector(input)); 518 Handle<String> sample_subject = 519 isolate->factory()->NewStringFromUtf8(CStrVector("")); 520 RegExpEngine::Compile(&compile_data, 521 false, 522 false, 523 multiline, 524 pattern, 525 sample_subject, 526 is_ascii, 527 zone); 528 return compile_data.node; 529} 530 531 532static void Execute(const char* input, 533 bool multiline, 534 bool is_ascii, 535 bool dot_output = false) { 536 v8::HandleScope scope(v8::Isolate::GetCurrent()); 537 Zone zone(Isolate::Current()); 538 RegExpNode* node = Compile(input, multiline, is_ascii, &zone); 539 USE(node); 540#ifdef DEBUG 541 if (dot_output) { 542 RegExpEngine::DotPrint(input, node, false); 543 exit(0); 544 } 545#endif // DEBUG 546} 547 548 549class TestConfig { 550 public: 551 typedef int Key; 552 typedef int Value; 553 static const int kNoKey; 554 static int NoValue() { return 0; } 555 static inline int Compare(int a, int b) { 556 if (a < b) 557 return -1; 558 else if (a > b) 559 return 1; 560 else 561 return 0; 562 } 563}; 564 565 566const int TestConfig::kNoKey = 0; 567 568 569static unsigned PseudoRandom(int i, int j) { 570 return ~(~((i * 781) ^ (j * 329))); 571} 572 573 574TEST(SplayTreeSimple) { 575 v8::internal::V8::Initialize(NULL); 576 static const unsigned kLimit = 1000; 577 Zone zone(Isolate::Current()); 578 ZoneSplayTree<TestConfig> tree(&zone); 579 bool seen[kLimit]; 580 for (unsigned i = 0; i < kLimit; i++) seen[i] = false; 581#define CHECK_MAPS_EQUAL() do { \ 582 for (unsigned k = 0; k < kLimit; k++) \ 583 CHECK_EQ(seen[k], tree.Find(k, &loc)); \ 584 } while (false) 585 for (int i = 0; i < 50; i++) { 586 for (int j = 
0; j < 50; j++) { 587 unsigned next = PseudoRandom(i, j) % kLimit; 588 if (seen[next]) { 589 // We've already seen this one. Check the value and remove 590 // it. 591 ZoneSplayTree<TestConfig>::Locator loc; 592 CHECK(tree.Find(next, &loc)); 593 CHECK_EQ(next, loc.key()); 594 CHECK_EQ(3 * next, loc.value()); 595 tree.Remove(next); 596 seen[next] = false; 597 CHECK_MAPS_EQUAL(); 598 } else { 599 // Check that it wasn't there already and then add it. 600 ZoneSplayTree<TestConfig>::Locator loc; 601 CHECK(!tree.Find(next, &loc)); 602 CHECK(tree.Insert(next, &loc)); 603 CHECK_EQ(next, loc.key()); 604 loc.set_value(3 * next); 605 seen[next] = true; 606 CHECK_MAPS_EQUAL(); 607 } 608 int val = PseudoRandom(j, i) % kLimit; 609 if (seen[val]) { 610 ZoneSplayTree<TestConfig>::Locator loc; 611 CHECK(tree.FindGreatestLessThan(val, &loc)); 612 CHECK_EQ(loc.key(), val); 613 break; 614 } 615 val = PseudoRandom(i + j, i - j) % kLimit; 616 if (seen[val]) { 617 ZoneSplayTree<TestConfig>::Locator loc; 618 CHECK(tree.FindLeastGreaterThan(val, &loc)); 619 CHECK_EQ(loc.key(), val); 620 break; 621 } 622 } 623 } 624} 625 626 627TEST(DispatchTableConstruction) { 628 v8::internal::V8::Initialize(NULL); 629 // Initialize test data. 630 static const int kLimit = 1000; 631 static const int kRangeCount = 8; 632 static const int kRangeSize = 16; 633 uc16 ranges[kRangeCount][2 * kRangeSize]; 634 for (int i = 0; i < kRangeCount; i++) { 635 Vector<uc16> range(ranges[i], 2 * kRangeSize); 636 for (int j = 0; j < 2 * kRangeSize; j++) { 637 range[j] = PseudoRandom(i + 25, j + 87) % kLimit; 638 } 639 range.Sort(); 640 for (int j = 1; j < 2 * kRangeSize; j++) { 641 CHECK(range[j-1] <= range[j]); 642 } 643 } 644 // Enter test data into dispatch table. 
645 Zone zone(Isolate::Current()); 646 DispatchTable table(&zone); 647 for (int i = 0; i < kRangeCount; i++) { 648 uc16* range = ranges[i]; 649 for (int j = 0; j < 2 * kRangeSize; j += 2) 650 table.AddRange(CharacterRange(range[j], range[j + 1]), i, &zone); 651 } 652 // Check that the table looks as we would expect 653 for (int p = 0; p < kLimit; p++) { 654 OutSet* outs = table.Get(p); 655 for (int j = 0; j < kRangeCount; j++) { 656 uc16* range = ranges[j]; 657 bool is_on = false; 658 for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2) 659 is_on = (range[k] <= p && p <= range[k + 1]); 660 CHECK_EQ(is_on, outs->Get(j)); 661 } 662 } 663} 664 665 666// Test of debug-only syntax. 667#ifdef DEBUG 668 669TEST(ParsePossessiveRepetition) { 670 bool old_flag_value = FLAG_regexp_possessive_quantifier; 671 672 // Enable possessive quantifier syntax. 673 FLAG_regexp_possessive_quantifier = true; 674 675 CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')"); 676 CHECK_PARSE_EQ("a++", "(# 1 - p 'a')"); 677 CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')"); 678 CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')"); 679 CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')"); 680 681 // Disable possessive quantifier syntax. 682 FLAG_regexp_possessive_quantifier = false; 683 684 CHECK_PARSE_ERROR("a*+"); 685 CHECK_PARSE_ERROR("a++"); 686 CHECK_PARSE_ERROR("a?+"); 687 CHECK_PARSE_ERROR("a{10,20}+"); 688 CHECK_PARSE_ERROR("a{10,20}+b"); 689 690 FLAG_regexp_possessive_quantifier = old_flag_value; 691} 692 693#endif 694 695// Tests of interpreter. 
696 697 698#ifndef V8_INTERPRETED_REGEXP 699 700#if V8_TARGET_ARCH_IA32 701typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler; 702#elif V8_TARGET_ARCH_X64 703typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler; 704#elif V8_TARGET_ARCH_ARM 705typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler; 706#elif V8_TARGET_ARCH_MIPS 707typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler; 708#endif 709 710class ContextInitializer { 711 public: 712 ContextInitializer() 713 : scope_(v8::Isolate::GetCurrent()), 714 env_(v8::Context::New(v8::Isolate::GetCurrent())) { 715 env_->Enter(); 716 } 717 ~ContextInitializer() { 718 env_->Exit(); 719 } 720 private: 721 v8::HandleScope scope_; 722 v8::Handle<v8::Context> env_; 723}; 724 725 726static ArchRegExpMacroAssembler::Result Execute(Code* code, 727 String* input, 728 int start_offset, 729 const byte* input_start, 730 const byte* input_end, 731 int* captures) { 732 return NativeRegExpMacroAssembler::Execute( 733 code, 734 input, 735 start_offset, 736 input_start, 737 input_end, 738 captures, 739 0, 740 Isolate::Current()); 741} 742 743 744TEST(MacroAssemblerNativeSuccess) { 745 v8::V8::Initialize(); 746 ContextInitializer initializer; 747 Isolate* isolate = Isolate::Current(); 748 Factory* factory = isolate->factory(); 749 Zone zone(isolate); 750 751 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone); 752 753 m.Succeed(); 754 755 Handle<String> source = factory->NewStringFromAscii(CStrVector("")); 756 Handle<Object> code_object = m.GetCode(source); 757 Handle<Code> code = Handle<Code>::cast(code_object); 758 759 int captures[4] = {42, 37, 87, 117}; 760 Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo")); 761 Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input); 762 const byte* start_adr = 763 reinterpret_cast<const byte*>(seq_input->GetCharsAddress()); 764 765 NativeRegExpMacroAssembler::Result result = 766 Execute(*code, 767 *input, 768 0, 769 
start_adr, 770 start_adr + seq_input->length(), 771 captures); 772 773 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 774 CHECK_EQ(-1, captures[0]); 775 CHECK_EQ(-1, captures[1]); 776 CHECK_EQ(-1, captures[2]); 777 CHECK_EQ(-1, captures[3]); 778} 779 780 781TEST(MacroAssemblerNativeSimple) { 782 v8::V8::Initialize(); 783 ContextInitializer initializer; 784 Isolate* isolate = Isolate::Current(); 785 Factory* factory = isolate->factory(); 786 Zone zone(isolate); 787 788 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone); 789 790 Label fail, backtrack; 791 m.PushBacktrack(&fail); 792 m.CheckNotAtStart(NULL); 793 m.LoadCurrentCharacter(2, NULL); 794 m.CheckNotCharacter('o', NULL); 795 m.LoadCurrentCharacter(1, NULL, false); 796 m.CheckNotCharacter('o', NULL); 797 m.LoadCurrentCharacter(0, NULL, false); 798 m.CheckNotCharacter('f', NULL); 799 m.WriteCurrentPositionToRegister(0, 0); 800 m.WriteCurrentPositionToRegister(1, 3); 801 m.AdvanceCurrentPosition(3); 802 m.PushBacktrack(&backtrack); 803 m.Succeed(); 804 m.Bind(&backtrack); 805 m.Backtrack(); 806 m.Bind(&fail); 807 m.Fail(); 808 809 Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo")); 810 Handle<Object> code_object = m.GetCode(source); 811 Handle<Code> code = Handle<Code>::cast(code_object); 812 813 int captures[4] = {42, 37, 87, 117}; 814 Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo")); 815 Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input); 816 Address start_adr = seq_input->GetCharsAddress(); 817 818 NativeRegExpMacroAssembler::Result result = 819 Execute(*code, 820 *input, 821 0, 822 start_adr, 823 start_adr + input->length(), 824 captures); 825 826 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 827 CHECK_EQ(0, captures[0]); 828 CHECK_EQ(3, captures[1]); 829 CHECK_EQ(-1, captures[2]); 830 CHECK_EQ(-1, captures[3]); 831 832 input = factory->NewStringFromAscii(CStrVector("barbarbar")); 833 seq_input = 
Handle<SeqOneByteString>::cast(input); 834 start_adr = seq_input->GetCharsAddress(); 835 836 result = Execute(*code, 837 *input, 838 0, 839 start_adr, 840 start_adr + input->length(), 841 captures); 842 843 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 844} 845 846 847TEST(MacroAssemblerNativeSimpleUC16) { 848 v8::V8::Initialize(); 849 ContextInitializer initializer; 850 Isolate* isolate = Isolate::Current(); 851 Factory* factory = isolate->factory(); 852 Zone zone(isolate); 853 854 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4, &zone); 855 856 Label fail, backtrack; 857 m.PushBacktrack(&fail); 858 m.CheckNotAtStart(NULL); 859 m.LoadCurrentCharacter(2, NULL); 860 m.CheckNotCharacter('o', NULL); 861 m.LoadCurrentCharacter(1, NULL, false); 862 m.CheckNotCharacter('o', NULL); 863 m.LoadCurrentCharacter(0, NULL, false); 864 m.CheckNotCharacter('f', NULL); 865 m.WriteCurrentPositionToRegister(0, 0); 866 m.WriteCurrentPositionToRegister(1, 3); 867 m.AdvanceCurrentPosition(3); 868 m.PushBacktrack(&backtrack); 869 m.Succeed(); 870 m.Bind(&backtrack); 871 m.Backtrack(); 872 m.Bind(&fail); 873 m.Fail(); 874 875 Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo")); 876 Handle<Object> code_object = m.GetCode(source); 877 Handle<Code> code = Handle<Code>::cast(code_object); 878 879 int captures[4] = {42, 37, 87, 117}; 880 const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o', 881 static_cast<uc16>(0x2603)}; 882 Handle<String> input = 883 factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6)); 884 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input); 885 Address start_adr = seq_input->GetCharsAddress(); 886 887 NativeRegExpMacroAssembler::Result result = 888 Execute(*code, 889 *input, 890 0, 891 start_adr, 892 start_adr + input->length(), 893 captures); 894 895 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 896 CHECK_EQ(0, captures[0]); 897 CHECK_EQ(3, captures[1]); 898 CHECK_EQ(-1, captures[2]); 
899 CHECK_EQ(-1, captures[3]); 900 901 const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a', 902 static_cast<uc16>(0x2603)}; 903 input = factory->NewStringFromTwoByte(Vector<const uc16>(input_data2, 9)); 904 seq_input = Handle<SeqTwoByteString>::cast(input); 905 start_adr = seq_input->GetCharsAddress(); 906 907 result = Execute(*code, 908 *input, 909 0, 910 start_adr, 911 start_adr + input->length() * 2, 912 captures); 913 914 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 915} 916 917 918TEST(MacroAssemblerNativeBacktrack) { 919 v8::V8::Initialize(); 920 ContextInitializer initializer; 921 Isolate* isolate = Isolate::Current(); 922 Factory* factory = isolate->factory(); 923 Zone zone(isolate); 924 925 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0, &zone); 926 927 Label fail; 928 Label backtrack; 929 m.LoadCurrentCharacter(10, &fail); 930 m.Succeed(); 931 m.Bind(&fail); 932 m.PushBacktrack(&backtrack); 933 m.LoadCurrentCharacter(10, NULL); 934 m.Succeed(); 935 m.Bind(&backtrack); 936 m.Fail(); 937 938 Handle<String> source = factory->NewStringFromAscii(CStrVector("..........")); 939 Handle<Object> code_object = m.GetCode(source); 940 Handle<Code> code = Handle<Code>::cast(code_object); 941 942 Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo")); 943 Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input); 944 Address start_adr = seq_input->GetCharsAddress(); 945 946 NativeRegExpMacroAssembler::Result result = 947 Execute(*code, 948 *input, 949 0, 950 start_adr, 951 start_adr + input->length(), 952 NULL); 953 954 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 955} 956 957 958TEST(MacroAssemblerNativeBackReferenceASCII) { 959 v8::V8::Initialize(); 960 ContextInitializer initializer; 961 Isolate* isolate = Isolate::Current(); 962 Factory* factory = isolate->factory(); 963 Zone zone(isolate); 964 965 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone); 966 967 
m.WriteCurrentPositionToRegister(0, 0); 968 m.AdvanceCurrentPosition(2); 969 m.WriteCurrentPositionToRegister(1, 0); 970 Label nomatch; 971 m.CheckNotBackReference(0, &nomatch); 972 m.Fail(); 973 m.Bind(&nomatch); 974 m.AdvanceCurrentPosition(2); 975 Label missing_match; 976 m.CheckNotBackReference(0, &missing_match); 977 m.WriteCurrentPositionToRegister(2, 0); 978 m.Succeed(); 979 m.Bind(&missing_match); 980 m.Fail(); 981 982 Handle<String> source = factory->NewStringFromAscii(CStrVector("^(..)..\1")); 983 Handle<Object> code_object = m.GetCode(source); 984 Handle<Code> code = Handle<Code>::cast(code_object); 985 986 Handle<String> input = factory->NewStringFromAscii(CStrVector("fooofo")); 987 Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input); 988 Address start_adr = seq_input->GetCharsAddress(); 989 990 int output[4]; 991 NativeRegExpMacroAssembler::Result result = 992 Execute(*code, 993 *input, 994 0, 995 start_adr, 996 start_adr + input->length(), 997 output); 998 999 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1000 CHECK_EQ(0, output[0]); 1001 CHECK_EQ(2, output[1]); 1002 CHECK_EQ(6, output[2]); 1003 CHECK_EQ(-1, output[3]); 1004} 1005 1006 1007TEST(MacroAssemblerNativeBackReferenceUC16) { 1008 v8::V8::Initialize(); 1009 ContextInitializer initializer; 1010 Isolate* isolate = Isolate::Current(); 1011 Factory* factory = isolate->factory(); 1012 Zone zone(isolate); 1013 1014 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4, &zone); 1015 1016 m.WriteCurrentPositionToRegister(0, 0); 1017 m.AdvanceCurrentPosition(2); 1018 m.WriteCurrentPositionToRegister(1, 0); 1019 Label nomatch; 1020 m.CheckNotBackReference(0, &nomatch); 1021 m.Fail(); 1022 m.Bind(&nomatch); 1023 m.AdvanceCurrentPosition(2); 1024 Label missing_match; 1025 m.CheckNotBackReference(0, &missing_match); 1026 m.WriteCurrentPositionToRegister(2, 0); 1027 m.Succeed(); 1028 m.Bind(&missing_match); 1029 m.Fail(); 1030 1031 Handle<String> source = 
factory->NewStringFromAscii(CStrVector("^(..)..\1")); 1032 Handle<Object> code_object = m.GetCode(source); 1033 Handle<Code> code = Handle<Code>::cast(code_object); 1034 1035 const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028}; 1036 Handle<String> input = 1037 factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6)); 1038 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input); 1039 Address start_adr = seq_input->GetCharsAddress(); 1040 1041 int output[4]; 1042 NativeRegExpMacroAssembler::Result result = 1043 Execute(*code, 1044 *input, 1045 0, 1046 start_adr, 1047 start_adr + input->length() * 2, 1048 output); 1049 1050 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1051 CHECK_EQ(0, output[0]); 1052 CHECK_EQ(2, output[1]); 1053 CHECK_EQ(6, output[2]); 1054 CHECK_EQ(-1, output[3]); 1055} 1056 1057 1058 1059TEST(MacroAssemblernativeAtStart) { 1060 v8::V8::Initialize(); 1061 ContextInitializer initializer; 1062 Isolate* isolate = Isolate::Current(); 1063 Factory* factory = isolate->factory(); 1064 Zone zone(isolate); 1065 1066 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0, &zone); 1067 1068 Label not_at_start, newline, fail; 1069 m.CheckNotAtStart(¬_at_start); 1070 // Check that prevchar = '\n' and current = 'f'. 1071 m.CheckCharacter('\n', &newline); 1072 m.Bind(&fail); 1073 m.Fail(); 1074 m.Bind(&newline); 1075 m.LoadCurrentCharacter(0, &fail); 1076 m.CheckNotCharacter('f', &fail); 1077 m.Succeed(); 1078 1079 m.Bind(¬_at_start); 1080 // Check that prevchar = 'o' and current = 'b'. 
1081 Label prevo; 1082 m.CheckCharacter('o', &prevo); 1083 m.Fail(); 1084 m.Bind(&prevo); 1085 m.LoadCurrentCharacter(0, &fail); 1086 m.CheckNotCharacter('b', &fail); 1087 m.Succeed(); 1088 1089 Handle<String> source = factory->NewStringFromAscii(CStrVector("(^f|ob)")); 1090 Handle<Object> code_object = m.GetCode(source); 1091 Handle<Code> code = Handle<Code>::cast(code_object); 1092 1093 Handle<String> input = factory->NewStringFromAscii(CStrVector("foobar")); 1094 Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input); 1095 Address start_adr = seq_input->GetCharsAddress(); 1096 1097 NativeRegExpMacroAssembler::Result result = 1098 Execute(*code, 1099 *input, 1100 0, 1101 start_adr, 1102 start_adr + input->length(), 1103 NULL); 1104 1105 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1106 1107 result = Execute(*code, 1108 *input, 1109 3, 1110 start_adr + 3, 1111 start_adr + input->length(), 1112 NULL); 1113 1114 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1115} 1116 1117 1118TEST(MacroAssemblerNativeBackRefNoCase) { 1119 v8::V8::Initialize(); 1120 ContextInitializer initializer; 1121 Isolate* isolate = Isolate::Current(); 1122 Factory* factory = isolate->factory(); 1123 Zone zone(isolate); 1124 1125 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone); 1126 1127 Label fail, succ; 1128 1129 m.WriteCurrentPositionToRegister(0, 0); 1130 m.WriteCurrentPositionToRegister(2, 0); 1131 m.AdvanceCurrentPosition(3); 1132 m.WriteCurrentPositionToRegister(3, 0); 1133 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "AbC". 1134 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "ABC". 
1135 Label expected_fail; 1136 m.CheckNotBackReferenceIgnoreCase(2, &expected_fail); 1137 m.Bind(&fail); 1138 m.Fail(); 1139 1140 m.Bind(&expected_fail); 1141 m.AdvanceCurrentPosition(3); // Skip "xYz" 1142 m.CheckNotBackReferenceIgnoreCase(2, &succ); 1143 m.Fail(); 1144 1145 m.Bind(&succ); 1146 m.WriteCurrentPositionToRegister(1, 0); 1147 m.Succeed(); 1148 1149 Handle<String> source = 1150 factory->NewStringFromAscii(CStrVector("^(abc)\1\1(?!\1)...(?!\1)")); 1151 Handle<Object> code_object = m.GetCode(source); 1152 Handle<Code> code = Handle<Code>::cast(code_object); 1153 1154 Handle<String> input = 1155 factory->NewStringFromAscii(CStrVector("aBcAbCABCxYzab")); 1156 Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input); 1157 Address start_adr = seq_input->GetCharsAddress(); 1158 1159 int output[4]; 1160 NativeRegExpMacroAssembler::Result result = 1161 Execute(*code, 1162 *input, 1163 0, 1164 start_adr, 1165 start_adr + input->length(), 1166 output); 1167 1168 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1169 CHECK_EQ(0, output[0]); 1170 CHECK_EQ(12, output[1]); 1171 CHECK_EQ(0, output[2]); 1172 CHECK_EQ(3, output[3]); 1173} 1174 1175 1176 1177TEST(MacroAssemblerNativeRegisters) { 1178 v8::V8::Initialize(); 1179 ContextInitializer initializer; 1180 Isolate* isolate = Isolate::Current(); 1181 Factory* factory = isolate->factory(); 1182 Zone zone(isolate); 1183 1184 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 6, &zone); 1185 1186 uc16 foo_chars[3] = {'f', 'o', 'o'}; 1187 Vector<const uc16> foo(foo_chars, 3); 1188 1189 enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt }; 1190 Label fail; 1191 Label backtrack; 1192 m.WriteCurrentPositionToRegister(out1, 0); // Output: [0] 1193 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck); 1194 m.PushBacktrack(&backtrack); 1195 m.WriteStackPointerToRegister(sp); 1196 // Fill stack and registers 1197 m.AdvanceCurrentPosition(2); 1198 
m.WriteCurrentPositionToRegister(out1, 0); 1199 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck); 1200 m.PushBacktrack(&fail); 1201 // Drop backtrack stack frames. 1202 m.ReadStackPointerFromRegister(sp); 1203 // And take the first backtrack (to &backtrack) 1204 m.Backtrack(); 1205 1206 m.PushCurrentPosition(); 1207 m.AdvanceCurrentPosition(2); 1208 m.PopCurrentPosition(); 1209 1210 m.Bind(&backtrack); 1211 m.PopRegister(out1); 1212 m.ReadCurrentPositionFromRegister(out1); 1213 m.AdvanceCurrentPosition(3); 1214 m.WriteCurrentPositionToRegister(out2, 0); // [0,3] 1215 1216 Label loop; 1217 m.SetRegister(loop_cnt, 0); // loop counter 1218 m.Bind(&loop); 1219 m.AdvanceRegister(loop_cnt, 1); 1220 m.AdvanceCurrentPosition(1); 1221 m.IfRegisterLT(loop_cnt, 3, &loop); 1222 m.WriteCurrentPositionToRegister(out3, 0); // [0,3,6] 1223 1224 Label loop2; 1225 m.SetRegister(loop_cnt, 2); // loop counter 1226 m.Bind(&loop2); 1227 m.AdvanceRegister(loop_cnt, -1); 1228 m.AdvanceCurrentPosition(1); 1229 m.IfRegisterGE(loop_cnt, 0, &loop2); 1230 m.WriteCurrentPositionToRegister(out4, 0); // [0,3,6,9] 1231 1232 Label loop3; 1233 Label exit_loop3; 1234 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck); 1235 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck); 1236 m.ReadCurrentPositionFromRegister(out3); 1237 m.Bind(&loop3); 1238 m.AdvanceCurrentPosition(1); 1239 m.CheckGreedyLoop(&exit_loop3); 1240 m.GoTo(&loop3); 1241 m.Bind(&exit_loop3); 1242 m.PopCurrentPosition(); 1243 m.WriteCurrentPositionToRegister(out5, 0); // [0,3,6,9,9,-1] 1244 1245 m.Succeed(); 1246 1247 m.Bind(&fail); 1248 m.Fail(); 1249 1250 Handle<String> source = 1251 factory->NewStringFromAscii(CStrVector("<loop test>")); 1252 Handle<Object> code_object = m.GetCode(source); 1253 Handle<Code> code = Handle<Code>::cast(code_object); 1254 1255 // String long enough for test (content doesn't matter). 
1256 Handle<String> input = 1257 factory->NewStringFromAscii(CStrVector("foofoofoofoofoo")); 1258 Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input); 1259 Address start_adr = seq_input->GetCharsAddress(); 1260 1261 int output[6]; 1262 NativeRegExpMacroAssembler::Result result = 1263 Execute(*code, 1264 *input, 1265 0, 1266 start_adr, 1267 start_adr + input->length(), 1268 output); 1269 1270 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1271 CHECK_EQ(0, output[0]); 1272 CHECK_EQ(3, output[1]); 1273 CHECK_EQ(6, output[2]); 1274 CHECK_EQ(9, output[3]); 1275 CHECK_EQ(9, output[4]); 1276 CHECK_EQ(-1, output[5]); 1277} 1278 1279 1280TEST(MacroAssemblerStackOverflow) { 1281 v8::V8::Initialize(); 1282 ContextInitializer initializer; 1283 Isolate* isolate = Isolate::Current(); 1284 Factory* factory = isolate->factory(); 1285 Zone zone(isolate); 1286 1287 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0, &zone); 1288 1289 Label loop; 1290 m.Bind(&loop); 1291 m.PushBacktrack(&loop); 1292 m.GoTo(&loop); 1293 1294 Handle<String> source = 1295 factory->NewStringFromAscii(CStrVector("<stack overflow test>")); 1296 Handle<Object> code_object = m.GetCode(source); 1297 Handle<Code> code = Handle<Code>::cast(code_object); 1298 1299 // String long enough for test (content doesn't matter). 
1300 Handle<String> input = 1301 factory->NewStringFromAscii(CStrVector("dummy")); 1302 Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input); 1303 Address start_adr = seq_input->GetCharsAddress(); 1304 1305 NativeRegExpMacroAssembler::Result result = 1306 Execute(*code, 1307 *input, 1308 0, 1309 start_adr, 1310 start_adr + input->length(), 1311 NULL); 1312 1313 CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result); 1314 CHECK(isolate->has_pending_exception()); 1315 isolate->clear_pending_exception(); 1316} 1317 1318 1319TEST(MacroAssemblerNativeLotsOfRegisters) { 1320 v8::V8::Initialize(); 1321 ContextInitializer initializer; 1322 Isolate* isolate = Isolate::Current(); 1323 Factory* factory = isolate->factory(); 1324 Zone zone(isolate); 1325 1326 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 2, &zone); 1327 1328 // At least 2048, to ensure the allocated space for registers 1329 // span one full page. 1330 const int large_number = 8000; 1331 m.WriteCurrentPositionToRegister(large_number, 42); 1332 m.WriteCurrentPositionToRegister(0, 0); 1333 m.WriteCurrentPositionToRegister(1, 1); 1334 Label done; 1335 m.CheckNotBackReference(0, &done); // Performs a system-stack push. 1336 m.Bind(&done); 1337 m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck); 1338 m.PopRegister(1); 1339 m.Succeed(); 1340 1341 Handle<String> source = 1342 factory->NewStringFromAscii(CStrVector("<huge register space test>")); 1343 Handle<Object> code_object = m.GetCode(source); 1344 Handle<Code> code = Handle<Code>::cast(code_object); 1345 1346 // String long enough for test (content doesn't matter). 
1347 Handle<String> input = 1348 factory->NewStringFromAscii(CStrVector("sample text")); 1349 Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input); 1350 Address start_adr = seq_input->GetCharsAddress(); 1351 1352 int captures[2]; 1353 NativeRegExpMacroAssembler::Result result = 1354 Execute(*code, 1355 *input, 1356 0, 1357 start_adr, 1358 start_adr + input->length(), 1359 captures); 1360 1361 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1362 CHECK_EQ(0, captures[0]); 1363 CHECK_EQ(42, captures[1]); 1364 1365 isolate->clear_pending_exception(); 1366} 1367 1368#else // V8_INTERPRETED_REGEXP 1369 1370TEST(MacroAssembler) { 1371 V8::Initialize(NULL); 1372 byte codes[1024]; 1373 Zone zone(Isolate::Current()); 1374 RegExpMacroAssemblerIrregexp m(Vector<byte>(codes, 1024), &zone); 1375 // ^f(o)o. 1376 Label start, fail, backtrack; 1377 1378 m.SetRegister(4, 42); 1379 m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck); 1380 m.AdvanceRegister(4, 42); 1381 m.GoTo(&start); 1382 m.Fail(); 1383 m.Bind(&start); 1384 m.PushBacktrack(&fail); 1385 m.CheckNotAtStart(NULL); 1386 m.LoadCurrentCharacter(0, NULL); 1387 m.CheckNotCharacter('f', NULL); 1388 m.LoadCurrentCharacter(1, NULL); 1389 m.CheckNotCharacter('o', NULL); 1390 m.LoadCurrentCharacter(2, NULL); 1391 m.CheckNotCharacter('o', NULL); 1392 m.WriteCurrentPositionToRegister(0, 0); 1393 m.WriteCurrentPositionToRegister(1, 3); 1394 m.WriteCurrentPositionToRegister(2, 1); 1395 m.WriteCurrentPositionToRegister(3, 2); 1396 m.AdvanceCurrentPosition(3); 1397 m.PushBacktrack(&backtrack); 1398 m.Succeed(); 1399 m.Bind(&backtrack); 1400 m.ClearRegisters(2, 3); 1401 m.Backtrack(); 1402 m.Bind(&fail); 1403 m.PopRegister(0); 1404 m.Fail(); 1405 1406 Isolate* isolate = Isolate::Current(); 1407 Factory* factory = isolate->factory(); 1408 HandleScope scope(isolate); 1409 1410 Handle<String> source = factory->NewStringFromAscii(CStrVector("^f(o)o")); 1411 Handle<ByteArray> array = 
Handle<ByteArray>::cast(m.GetCode(source)); 1412 int captures[5]; 1413 1414 const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'}; 1415 Handle<String> f1_16 = 1416 factory->NewStringFromTwoByte(Vector<const uc16>(str1, 6)); 1417 1418 CHECK(IrregexpInterpreter::Match(isolate, array, f1_16, captures, 0)); 1419 CHECK_EQ(0, captures[0]); 1420 CHECK_EQ(3, captures[1]); 1421 CHECK_EQ(1, captures[2]); 1422 CHECK_EQ(2, captures[3]); 1423 CHECK_EQ(84, captures[4]); 1424 1425 const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'}; 1426 Handle<String> f2_16 = 1427 factory->NewStringFromTwoByte(Vector<const uc16>(str2, 6)); 1428 1429 CHECK(!IrregexpInterpreter::Match(isolate, array, f2_16, captures, 0)); 1430 CHECK_EQ(42, captures[0]); 1431} 1432 1433#endif // V8_INTERPRETED_REGEXP 1434 1435 1436TEST(AddInverseToTable) { 1437 v8::internal::V8::Initialize(NULL); 1438 static const int kLimit = 1000; 1439 static const int kRangeCount = 16; 1440 for (int t = 0; t < 10; t++) { 1441 Zone zone(Isolate::Current()); 1442 ZoneList<CharacterRange>* ranges = 1443 new(&zone) ZoneList<CharacterRange>(kRangeCount, &zone); 1444 for (int i = 0; i < kRangeCount; i++) { 1445 int from = PseudoRandom(t + 87, i + 25) % kLimit; 1446 int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20)); 1447 if (to > kLimit) to = kLimit; 1448 ranges->Add(CharacterRange(from, to), &zone); 1449 } 1450 DispatchTable table(&zone); 1451 DispatchTableConstructor cons(&table, false, &zone); 1452 cons.set_choice_index(0); 1453 cons.AddInverse(ranges); 1454 for (int i = 0; i < kLimit; i++) { 1455 bool is_on = false; 1456 for (int j = 0; !is_on && j < kRangeCount; j++) 1457 is_on = ranges->at(j).Contains(i); 1458 OutSet* set = table.Get(i); 1459 CHECK_EQ(is_on, set->Get(0) == false); 1460 } 1461 } 1462 Zone zone(Isolate::Current()); 1463 ZoneList<CharacterRange>* ranges = 1464 new(&zone) ZoneList<CharacterRange>(1, &zone); 1465 ranges->Add(CharacterRange(0xFFF0, 0xFFFE), &zone); 1466 DispatchTable table(&zone); 1467 
DispatchTableConstructor cons(&table, false, &zone); 1468 cons.set_choice_index(0); 1469 cons.AddInverse(ranges); 1470 CHECK(!table.Get(0xFFFE)->Get(0)); 1471 CHECK(table.Get(0xFFFF)->Get(0)); 1472} 1473 1474 1475static uc32 canonicalize(uc32 c) { 1476 unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth]; 1477 int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL); 1478 if (count == 0) { 1479 return c; 1480 } else { 1481 CHECK_EQ(1, count); 1482 return canon[0]; 1483 } 1484} 1485 1486 1487TEST(LatinCanonicalize) { 1488 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1489 for (char lower = 'a'; lower <= 'z'; lower++) { 1490 char upper = lower + ('A' - 'a'); 1491 CHECK_EQ(canonicalize(lower), canonicalize(upper)); 1492 unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1493 int length = un_canonicalize.get(lower, '\0', uncanon); 1494 CHECK_EQ(2, length); 1495 CHECK_EQ(upper, uncanon[0]); 1496 CHECK_EQ(lower, uncanon[1]); 1497 } 1498 for (uc32 c = 128; c < (1 << 21); c++) 1499 CHECK_GE(canonicalize(c), 128); 1500 unibrow::Mapping<unibrow::ToUppercase> to_upper; 1501 // Canonicalization is only defined for the Basic Multilingual Plane. 
1502 for (uc32 c = 0; c < (1 << 16); c++) { 1503 unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth]; 1504 int length = to_upper.get(c, '\0', upper); 1505 if (length == 0) { 1506 length = 1; 1507 upper[0] = c; 1508 } 1509 uc32 u = upper[0]; 1510 if (length > 1 || (c >= 128 && u < 128)) 1511 u = c; 1512 CHECK_EQ(u, canonicalize(c)); 1513 } 1514} 1515 1516 1517static uc32 CanonRangeEnd(uc32 c) { 1518 unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth]; 1519 int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL); 1520 if (count == 0) { 1521 return c; 1522 } else { 1523 CHECK_EQ(1, count); 1524 return canon[0]; 1525 } 1526} 1527 1528 1529TEST(RangeCanonicalization) { 1530 // Check that we arrive at the same result when using the basic 1531 // range canonicalization primitives as when using immediate 1532 // canonicalization. 1533 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1534 int block_start = 0; 1535 while (block_start <= 0xFFFF) { 1536 uc32 block_end = CanonRangeEnd(block_start); 1537 unsigned block_length = block_end - block_start + 1; 1538 if (block_length > 1) { 1539 unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1540 int first_length = un_canonicalize.get(block_start, '\0', first); 1541 for (unsigned i = 1; i < block_length; i++) { 1542 unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1543 int succ_length = un_canonicalize.get(block_start + i, '\0', succ); 1544 CHECK_EQ(first_length, succ_length); 1545 for (int j = 0; j < succ_length; j++) { 1546 int calc = first[j] + i; 1547 int found = succ[j]; 1548 CHECK_EQ(calc, found); 1549 } 1550 } 1551 } 1552 block_start = block_start + block_length; 1553 } 1554} 1555 1556 1557TEST(UncanonicalizeEquivalence) { 1558 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1559 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1560 for (int i = 0; i < (1 << 16); i++) { 1561 int length = un_canonicalize.get(i, 
'\0', chars); 1562 for (int j = 0; j < length; j++) { 1563 unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1564 int length2 = un_canonicalize.get(chars[j], '\0', chars2); 1565 CHECK_EQ(length, length2); 1566 for (int k = 0; k < length; k++) 1567 CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k])); 1568 } 1569 } 1570} 1571 1572 1573static void TestRangeCaseIndependence(CharacterRange input, 1574 Vector<CharacterRange> expected) { 1575 Zone zone(Isolate::Current()); 1576 int count = expected.length(); 1577 ZoneList<CharacterRange>* list = 1578 new(&zone) ZoneList<CharacterRange>(count, &zone); 1579 input.AddCaseEquivalents(list, false, &zone); 1580 CHECK_EQ(count, list->length()); 1581 for (int i = 0; i < list->length(); i++) { 1582 CHECK_EQ(expected[i].from(), list->at(i).from()); 1583 CHECK_EQ(expected[i].to(), list->at(i).to()); 1584 } 1585} 1586 1587 1588static void TestSimpleRangeCaseIndependence(CharacterRange input, 1589 CharacterRange expected) { 1590 EmbeddedVector<CharacterRange, 1> vector; 1591 vector[0] = expected; 1592 TestRangeCaseIndependence(input, vector); 1593} 1594 1595 1596TEST(CharacterRangeCaseIndependence) { 1597 v8::internal::V8::Initialize(NULL); 1598 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'), 1599 CharacterRange::Singleton('A')); 1600 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'), 1601 CharacterRange::Singleton('Z')); 1602 TestSimpleRangeCaseIndependence(CharacterRange('a', 'z'), 1603 CharacterRange('A', 'Z')); 1604 TestSimpleRangeCaseIndependence(CharacterRange('c', 'f'), 1605 CharacterRange('C', 'F')); 1606 TestSimpleRangeCaseIndependence(CharacterRange('a', 'b'), 1607 CharacterRange('A', 'B')); 1608 TestSimpleRangeCaseIndependence(CharacterRange('y', 'z'), 1609 CharacterRange('Y', 'Z')); 1610 TestSimpleRangeCaseIndependence(CharacterRange('a' - 1, 'z' + 1), 1611 CharacterRange('A', 'Z')); 1612 TestSimpleRangeCaseIndependence(CharacterRange('A', 'Z'), 1613 
CharacterRange('a', 'z')); 1614 TestSimpleRangeCaseIndependence(CharacterRange('C', 'F'), 1615 CharacterRange('c', 'f')); 1616 TestSimpleRangeCaseIndependence(CharacterRange('A' - 1, 'Z' + 1), 1617 CharacterRange('a', 'z')); 1618 // Here we need to add [l-z] to complete the case independence of 1619 // [A-Za-z] but we expect [a-z] to be added since we always add a 1620 // whole block at a time. 1621 TestSimpleRangeCaseIndependence(CharacterRange('A', 'k'), 1622 CharacterRange('a', 'z')); 1623} 1624 1625 1626static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) { 1627 if (ranges == NULL) 1628 return false; 1629 for (int i = 0; i < ranges->length(); i++) { 1630 CharacterRange range = ranges->at(i); 1631 if (range.from() <= c && c <= range.to()) 1632 return true; 1633 } 1634 return false; 1635} 1636 1637 1638TEST(CharClassDifference) { 1639 v8::internal::V8::Initialize(NULL); 1640 Zone zone(Isolate::Current()); 1641 ZoneList<CharacterRange>* base = 1642 new(&zone) ZoneList<CharacterRange>(1, &zone); 1643 base->Add(CharacterRange::Everything(), &zone); 1644 Vector<const int> overlay = CharacterRange::GetWordBounds(); 1645 ZoneList<CharacterRange>* included = NULL; 1646 ZoneList<CharacterRange>* excluded = NULL; 1647 CharacterRange::Split(base, overlay, &included, &excluded, &zone); 1648 for (int i = 0; i < (1 << 16); i++) { 1649 bool in_base = InClass(i, base); 1650 if (in_base) { 1651 bool in_overlay = false; 1652 for (int j = 0; !in_overlay && j < overlay.length(); j += 2) { 1653 if (overlay[j] <= i && i < overlay[j+1]) 1654 in_overlay = true; 1655 } 1656 CHECK_EQ(in_overlay, InClass(i, included)); 1657 CHECK_EQ(!in_overlay, InClass(i, excluded)); 1658 } else { 1659 CHECK(!InClass(i, included)); 1660 CHECK(!InClass(i, excluded)); 1661 } 1662 } 1663} 1664 1665 1666TEST(CanonicalizeCharacterSets) { 1667 v8::internal::V8::Initialize(NULL); 1668 Zone zone(Isolate::Current()); 1669 ZoneList<CharacterRange>* list = 1670 new(&zone) ZoneList<CharacterRange>(4, 
&zone); 1671 CharacterSet set(list); 1672 1673 list->Add(CharacterRange(10, 20), &zone); 1674 list->Add(CharacterRange(30, 40), &zone); 1675 list->Add(CharacterRange(50, 60), &zone); 1676 set.Canonicalize(); 1677 ASSERT_EQ(3, list->length()); 1678 ASSERT_EQ(10, list->at(0).from()); 1679 ASSERT_EQ(20, list->at(0).to()); 1680 ASSERT_EQ(30, list->at(1).from()); 1681 ASSERT_EQ(40, list->at(1).to()); 1682 ASSERT_EQ(50, list->at(2).from()); 1683 ASSERT_EQ(60, list->at(2).to()); 1684 1685 list->Rewind(0); 1686 list->Add(CharacterRange(10, 20), &zone); 1687 list->Add(CharacterRange(50, 60), &zone); 1688 list->Add(CharacterRange(30, 40), &zone); 1689 set.Canonicalize(); 1690 ASSERT_EQ(3, list->length()); 1691 ASSERT_EQ(10, list->at(0).from()); 1692 ASSERT_EQ(20, list->at(0).to()); 1693 ASSERT_EQ(30, list->at(1).from()); 1694 ASSERT_EQ(40, list->at(1).to()); 1695 ASSERT_EQ(50, list->at(2).from()); 1696 ASSERT_EQ(60, list->at(2).to()); 1697 1698 list->Rewind(0); 1699 list->Add(CharacterRange(30, 40), &zone); 1700 list->Add(CharacterRange(10, 20), &zone); 1701 list->Add(CharacterRange(25, 25), &zone); 1702 list->Add(CharacterRange(100, 100), &zone); 1703 list->Add(CharacterRange(1, 1), &zone); 1704 set.Canonicalize(); 1705 ASSERT_EQ(5, list->length()); 1706 ASSERT_EQ(1, list->at(0).from()); 1707 ASSERT_EQ(1, list->at(0).to()); 1708 ASSERT_EQ(10, list->at(1).from()); 1709 ASSERT_EQ(20, list->at(1).to()); 1710 ASSERT_EQ(25, list->at(2).from()); 1711 ASSERT_EQ(25, list->at(2).to()); 1712 ASSERT_EQ(30, list->at(3).from()); 1713 ASSERT_EQ(40, list->at(3).to()); 1714 ASSERT_EQ(100, list->at(4).from()); 1715 ASSERT_EQ(100, list->at(4).to()); 1716 1717 list->Rewind(0); 1718 list->Add(CharacterRange(10, 19), &zone); 1719 list->Add(CharacterRange(21, 30), &zone); 1720 list->Add(CharacterRange(20, 20), &zone); 1721 set.Canonicalize(); 1722 ASSERT_EQ(1, list->length()); 1723 ASSERT_EQ(10, list->at(0).from()); 1724 ASSERT_EQ(30, list->at(0).to()); 1725} 1726 1727 
1728TEST(CharacterRangeMerge) { 1729 v8::internal::V8::Initialize(NULL); 1730 Zone zone(Isolate::Current()); 1731 ZoneList<CharacterRange> l1(4, &zone); 1732 ZoneList<CharacterRange> l2(4, &zone); 1733 // Create all combinations of intersections of ranges, both singletons and 1734 // longer. 1735 1736 int offset = 0; 1737 1738 // The five kinds of singleton intersections: 1739 // X 1740 // Y - outside before 1741 // Y - outside touching start 1742 // Y - overlap 1743 // Y - outside touching end 1744 // Y - outside after 1745 1746 for (int i = 0; i < 5; i++) { 1747 l1.Add(CharacterRange::Singleton(offset + 2), &zone); 1748 l2.Add(CharacterRange::Singleton(offset + i), &zone); 1749 offset += 6; 1750 } 1751 1752 // The seven kinds of singleton/non-singleton intersections: 1753 // XXX 1754 // Y - outside before 1755 // Y - outside touching start 1756 // Y - inside touching start 1757 // Y - entirely inside 1758 // Y - inside touching end 1759 // Y - outside touching end 1760 // Y - disjoint after 1761 1762 for (int i = 0; i < 7; i++) { 1763 l1.Add(CharacterRange::Range(offset + 2, offset + 4), &zone); 1764 l2.Add(CharacterRange::Singleton(offset + i), &zone); 1765 offset += 8; 1766 } 1767 1768 // The eleven kinds of non-singleton intersections: 1769 // 1770 // XXXXXXXX 1771 // YYYY - outside before. 1772 // YYYY - outside touching start. 1773 // YYYY - overlapping start 1774 // YYYY - inside touching start 1775 // YYYY - entirely inside 1776 // YYYY - inside touching end 1777 // YYYY - overlapping end 1778 // YYYY - outside touching end 1779 // YYYY - outside after 1780 // YYYYYYYY - identical 1781 // YYYYYYYYYYYY - containing entirely. 1782 1783 for (int i = 0; i < 9; i++) { 1784 l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone); // Length 8. 
1785 l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3), &zone); 1786 offset += 22; 1787 } 1788 l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone); 1789 l2.Add(CharacterRange::Range(offset + 6, offset + 15), &zone); 1790 offset += 22; 1791 l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone); 1792 l2.Add(CharacterRange::Range(offset + 4, offset + 17), &zone); 1793 offset += 22; 1794 1795 // Different kinds of multi-range overlap: 1796 // XXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXX 1797 // YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y 1798 1799 l1.Add(CharacterRange::Range(offset, offset + 21), &zone); 1800 l1.Add(CharacterRange::Range(offset + 31, offset + 52), &zone); 1801 for (int i = 0; i < 6; i++) { 1802 l2.Add(CharacterRange::Range(offset + 2, offset + 5), &zone); 1803 l2.Add(CharacterRange::Singleton(offset + 8), &zone); 1804 offset += 9; 1805 } 1806 1807 ASSERT(CharacterRange::IsCanonical(&l1)); 1808 ASSERT(CharacterRange::IsCanonical(&l2)); 1809 1810 ZoneList<CharacterRange> first_only(4, &zone); 1811 ZoneList<CharacterRange> second_only(4, &zone); 1812 ZoneList<CharacterRange> both(4, &zone); 1813} 1814 1815 1816TEST(Graph) { 1817 V8::Initialize(NULL); 1818 Execute("\\b\\w+\\b", false, true, true); 1819} 1820