1// Copyright 2012 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28
29#include <stdlib.h>
30
31#include "src/v8.h"
32
33#include "src/ast.h"
34#include "src/char-predicates-inl.h"
35#include "src/jsregexp.h"
36#include "src/parser.h"
37#include "src/regexp-macro-assembler-irregexp.h"
38#include "src/regexp-macro-assembler.h"
39#include "src/string-stream.h"
40#include "src/zone-inl.h"
41#ifdef V8_INTERPRETED_REGEXP
42#include "src/interpreter-irregexp.h"
43#else  // V8_INTERPRETED_REGEXP
44#include "src/macro-assembler.h"
45#include "src/code.h"
46#if V8_TARGET_ARCH_ARM
47#include "src/arm/assembler-arm.h"
48#include "src/arm/macro-assembler-arm.h"
49#include "src/arm/regexp-macro-assembler-arm.h"
50#endif
51#if V8_TARGET_ARCH_ARM64
52#include "src/arm64/assembler-arm64.h"
53#include "src/arm64/macro-assembler-arm64.h"
54#include "src/arm64/regexp-macro-assembler-arm64.h"
55#endif
56#if V8_TARGET_ARCH_MIPS
57#include "src/mips/assembler-mips.h"
58#include "src/mips/macro-assembler-mips.h"
59#include "src/mips/regexp-macro-assembler-mips.h"
60#endif
61#if V8_TARGET_ARCH_X64
62#include "src/x64/assembler-x64.h"
63#include "src/x64/macro-assembler-x64.h"
64#include "src/x64/regexp-macro-assembler-x64.h"
65#endif
66#if V8_TARGET_ARCH_IA32
67#include "src/ia32/assembler-ia32.h"
68#include "src/ia32/macro-assembler-ia32.h"
69#include "src/ia32/regexp-macro-assembler-ia32.h"
70#endif
71#if V8_TARGET_ARCH_X87
72#include "src/x87/assembler-x87.h"
73#include "src/x87/macro-assembler-x87.h"
74#include "src/x87/regexp-macro-assembler-x87.h"
75#endif
76#endif  // V8_INTERPRETED_REGEXP
77#include "test/cctest/cctest.h"
78
79using namespace v8::internal;
80
81
82static bool CheckParse(const char* input) {
83  V8::Initialize(NULL);
84  v8::HandleScope scope(CcTest::isolate());
85  Zone zone(CcTest::i_isolate());
86  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
87  RegExpCompileData result;
88  return v8::internal::RegExpParser::ParseRegExp(
89      &reader, false, &result, &zone);
90}
91
92
93static SmartArrayPointer<const char> Parse(const char* input) {
94  V8::Initialize(NULL);
95  v8::HandleScope scope(CcTest::isolate());
96  Zone zone(CcTest::i_isolate());
97  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
98  RegExpCompileData result;
99  CHECK(v8::internal::RegExpParser::ParseRegExp(
100      &reader, false, &result, &zone));
101  CHECK(result.tree != NULL);
102  CHECK(result.error.is_null());
103  SmartArrayPointer<const char> output = result.tree->ToString(&zone);
104  return output;
105}
106
107
108static bool CheckSimple(const char* input) {
109  V8::Initialize(NULL);
110  v8::HandleScope scope(CcTest::isolate());
111  Zone zone(CcTest::i_isolate());
112  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
113  RegExpCompileData result;
114  CHECK(v8::internal::RegExpParser::ParseRegExp(
115      &reader, false, &result, &zone));
116  CHECK(result.tree != NULL);
117  CHECK(result.error.is_null());
118  return result.simple;
119}
120
121struct MinMaxPair {
122  int min_match;
123  int max_match;
124};
125
126
127static MinMaxPair CheckMinMaxMatch(const char* input) {
128  V8::Initialize(NULL);
129  v8::HandleScope scope(CcTest::isolate());
130  Zone zone(CcTest::i_isolate());
131  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
132  RegExpCompileData result;
133  CHECK(v8::internal::RegExpParser::ParseRegExp(
134      &reader, false, &result, &zone));
135  CHECK(result.tree != NULL);
136  CHECK(result.error.is_null());
137  int min_match = result.tree->min_match();
138  int max_match = result.tree->max_match();
139  MinMaxPair pair = { min_match, max_match };
140  return pair;
141}
142
143
144#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))
145#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, Parse(input).get())
146#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
147#define CHECK_MIN_MAX(input, min, max)                                         \
148  { MinMaxPair min_max = CheckMinMaxMatch(input);                              \
149    CHECK_EQ(min, min_max.min_match);                                          \
150    CHECK_EQ(max, min_max.max_match);                                          \
151  }
152
153TEST(Parser) {
154  V8::Initialize(NULL);
155
156  CHECK_PARSE_ERROR("?");
157
158  CHECK_PARSE_EQ("abc", "'abc'");
159  CHECK_PARSE_EQ("", "%");
160  CHECK_PARSE_EQ("abc|def", "(| 'abc' 'def')");
161  CHECK_PARSE_EQ("abc|def|ghi", "(| 'abc' 'def' 'ghi')");
162  CHECK_PARSE_EQ("^xxx$", "(: @^i 'xxx' @$i)");
163  CHECK_PARSE_EQ("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')");
164  CHECK_PARSE_EQ("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])");
165  CHECK_PARSE_EQ("a*", "(# 0 - g 'a')");
166  CHECK_PARSE_EQ("a*?", "(# 0 - n 'a')");
167  CHECK_PARSE_EQ("abc+", "(: 'ab' (# 1 - g 'c'))");
168  CHECK_PARSE_EQ("abc+?", "(: 'ab' (# 1 - n 'c'))");
169  CHECK_PARSE_EQ("xyz?", "(: 'xy' (# 0 1 g 'z'))");
170  CHECK_PARSE_EQ("xyz??", "(: 'xy' (# 0 1 n 'z'))");
171  CHECK_PARSE_EQ("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))");
172  CHECK_PARSE_EQ("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))");
173  CHECK_PARSE_EQ("xyz{93}", "(: 'xy' (# 93 93 g 'z'))");
174  CHECK_PARSE_EQ("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))");
175  CHECK_PARSE_EQ("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))");
176  CHECK_PARSE_EQ("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))");
177  CHECK_PARSE_EQ("xyz{1,}", "(: 'xy' (# 1 - g 'z'))");
178  CHECK_PARSE_EQ("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))");
179  CHECK_PARSE_EQ("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'");
180  CHECK_PARSE_EQ("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')");
181  CHECK_PARSE_EQ("(?:foo)", "'foo'");
182  CHECK_PARSE_EQ("(?: foo )", "' foo '");
183  CHECK_PARSE_EQ("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))");
184  CHECK_PARSE_EQ("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')");
185  CHECK_PARSE_EQ("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')");
186  CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
187  CHECK_PARSE_EQ("()", "(^ %)");
188  CHECK_PARSE_EQ("(?=)", "(-> + %)");
189  CHECK_PARSE_EQ("[]", "^[\\x00-\\uffff]");   // Doesn't compile on windows
190  CHECK_PARSE_EQ("[^]", "[\\x00-\\uffff]");   // \uffff isn't in codepage 1252
191  CHECK_PARSE_EQ("[x]", "[x]");
192  CHECK_PARSE_EQ("[xyz]", "[x y z]");
193  CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
194  CHECK_PARSE_EQ("[-123]", "[- 1 2 3]");
195  CHECK_PARSE_EQ("[^123]", "^[1 2 3]");
196  CHECK_PARSE_EQ("]", "']'");
197  CHECK_PARSE_EQ("}", "'}'");
198  CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]");
199  CHECK_PARSE_EQ("[\\d]", "[0-9]");
200  CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]");
201  CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]");
202  CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]");
203  CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]");
204  // Control character outside character class.
205  CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK",
206                 "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
207  CHECK_PARSE_EQ("\\c!", "'\\c!'");
208  CHECK_PARSE_EQ("\\c_", "'\\c_'");
209  CHECK_PARSE_EQ("\\c~", "'\\c~'");
210  CHECK_PARSE_EQ("\\c1", "'\\c1'");
211  // Control character inside character class.
212  CHECK_PARSE_EQ("[\\c!]", "[\\ c !]");
213  CHECK_PARSE_EQ("[\\c_]", "[\\x1f]");
214  CHECK_PARSE_EQ("[\\c~]", "[\\ c ~]");
215  CHECK_PARSE_EQ("[\\ca]", "[\\x01]");
216  CHECK_PARSE_EQ("[\\cz]", "[\\x1a]");
217  CHECK_PARSE_EQ("[\\cA]", "[\\x01]");
218  CHECK_PARSE_EQ("[\\cZ]", "[\\x1a]");
219  CHECK_PARSE_EQ("[\\c1]", "[\\x11]");
220
221  CHECK_PARSE_EQ("[a\\]c]", "[a ] c]");
222  CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
223  CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ #  ]");
224  CHECK_PARSE_EQ("\\0", "'\\x00'");
225  CHECK_PARSE_EQ("\\8", "'8'");
226  CHECK_PARSE_EQ("\\9", "'9'");
227  CHECK_PARSE_EQ("\\11", "'\\x09'");
228  CHECK_PARSE_EQ("\\11a", "'\\x09a'");
229  CHECK_PARSE_EQ("\\011", "'\\x09'");
230  CHECK_PARSE_EQ("\\00011", "'\\x0011'");
231  CHECK_PARSE_EQ("\\118", "'\\x098'");
232  CHECK_PARSE_EQ("\\111", "'I'");
233  CHECK_PARSE_EQ("\\1111", "'I1'");
234  CHECK_PARSE_EQ("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))");
235  CHECK_PARSE_EQ("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))");
236  CHECK_PARSE_EQ("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))");
237  CHECK_PARSE_EQ("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')");
238  CHECK_PARSE_EQ("(x)(x)(x)\\1*", "(: (^ 'x') (^ 'x') (^ 'x')"
239                               " (# 0 - g (<- 1)))");
240  CHECK_PARSE_EQ("(x)(x)(x)\\2*", "(: (^ 'x') (^ 'x') (^ 'x')"
241                               " (# 0 - g (<- 2)))");
242  CHECK_PARSE_EQ("(x)(x)(x)\\3*", "(: (^ 'x') (^ 'x') (^ 'x')"
243                               " (# 0 - g (<- 3)))");
244  CHECK_PARSE_EQ("(x)(x)(x)\\4*", "(: (^ 'x') (^ 'x') (^ 'x')"
245                               " (# 0 - g '\\x04'))");
246  CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
247              "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
248              " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))");
249  CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11",
250              "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
251              " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')");
252  CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
253  CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
254  CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
255  CHECK_PARSE_EQ("(?=a)?a", "'a'");
256  CHECK_PARSE_EQ("(?=a){0,10}a", "'a'");
257  CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')");
258  CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')");
259  CHECK_PARSE_EQ("(?!a)?a", "'a'");
260  CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
261  CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
262  CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))");
263  CHECK_PARSE_EQ("[\\0]", "[\\x00]");
264  CHECK_PARSE_EQ("[\\11]", "[\\x09]");
265  CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]");
266  CHECK_PARSE_EQ("[\\011]", "[\\x09]");
267  CHECK_PARSE_EQ("[\\00011]", "[\\x00 1 1]");
268  CHECK_PARSE_EQ("[\\118]", "[\\x09 8]");
269  CHECK_PARSE_EQ("[\\111]", "[I]");
270  CHECK_PARSE_EQ("[\\1111]", "[I 1]");
271  CHECK_PARSE_EQ("\\x34", "'\x34'");
272  CHECK_PARSE_EQ("\\x60", "'\x60'");
273  CHECK_PARSE_EQ("\\x3z", "'x3z'");
274  CHECK_PARSE_EQ("\\c", "'\\c'");
275  CHECK_PARSE_EQ("\\u0034", "'\x34'");
276  CHECK_PARSE_EQ("\\u003z", "'u003z'");
277  CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))");
278
279  CHECK_SIMPLE("", false);
280  CHECK_SIMPLE("a", true);
281  CHECK_SIMPLE("a|b", false);
282  CHECK_SIMPLE("a\\n", false);
283  CHECK_SIMPLE("^a", false);
284  CHECK_SIMPLE("a$", false);
285  CHECK_SIMPLE("a\\b!", false);
286  CHECK_SIMPLE("a\\Bb", false);
287  CHECK_SIMPLE("a*", false);
288  CHECK_SIMPLE("a*?", false);
289  CHECK_SIMPLE("a?", false);
290  CHECK_SIMPLE("a??", false);
291  CHECK_SIMPLE("a{0,1}?", false);
292  CHECK_SIMPLE("a{1,1}?", false);
293  CHECK_SIMPLE("a{1,2}?", false);
294  CHECK_SIMPLE("a+?", false);
295  CHECK_SIMPLE("(a)", false);
296  CHECK_SIMPLE("(a)\\1", false);
297  CHECK_SIMPLE("(\\1a)", false);
298  CHECK_SIMPLE("\\1(a)", false);
299  CHECK_SIMPLE("a\\s", false);
300  CHECK_SIMPLE("a\\S", false);
301  CHECK_SIMPLE("a\\d", false);
302  CHECK_SIMPLE("a\\D", false);
303  CHECK_SIMPLE("a\\w", false);
304  CHECK_SIMPLE("a\\W", false);
305  CHECK_SIMPLE("a.", false);
306  CHECK_SIMPLE("a\\q", false);
307  CHECK_SIMPLE("a[a]", false);
308  CHECK_SIMPLE("a[^a]", false);
309  CHECK_SIMPLE("a[a-z]", false);
310  CHECK_SIMPLE("a[\\q]", false);
311  CHECK_SIMPLE("a(?:b)", false);
312  CHECK_SIMPLE("a(?=b)", false);
313  CHECK_SIMPLE("a(?!b)", false);
314  CHECK_SIMPLE("\\x60", false);
315  CHECK_SIMPLE("\\u0060", false);
316  CHECK_SIMPLE("\\cA", false);
317  CHECK_SIMPLE("\\q", false);
318  CHECK_SIMPLE("\\1112", false);
319  CHECK_SIMPLE("\\0", false);
320  CHECK_SIMPLE("(a)\\1", false);
321  CHECK_SIMPLE("(?=a)?a", false);
322  CHECK_SIMPLE("(?!a)?a\\1", false);
323  CHECK_SIMPLE("(?:(?=a))a\\1", false);
324
325  CHECK_PARSE_EQ("a{}", "'a{}'");
326  CHECK_PARSE_EQ("a{,}", "'a{,}'");
327  CHECK_PARSE_EQ("a{", "'a{'");
328  CHECK_PARSE_EQ("a{z}", "'a{z}'");
329  CHECK_PARSE_EQ("a{1z}", "'a{1z}'");
330  CHECK_PARSE_EQ("a{12z}", "'a{12z}'");
331  CHECK_PARSE_EQ("a{12,", "'a{12,'");
332  CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'");
333  CHECK_PARSE_EQ("{}", "'{}'");
334  CHECK_PARSE_EQ("{,}", "'{,}'");
335  CHECK_PARSE_EQ("{", "'{'");
336  CHECK_PARSE_EQ("{z}", "'{z}'");
337  CHECK_PARSE_EQ("{1z}", "'{1z}'");
338  CHECK_PARSE_EQ("{12z}", "'{12z}'");
339  CHECK_PARSE_EQ("{12,", "'{12,'");
340  CHECK_PARSE_EQ("{12,3b", "'{12,3b'");
341
342  CHECK_MIN_MAX("a", 1, 1);
343  CHECK_MIN_MAX("abc", 3, 3);
344  CHECK_MIN_MAX("a[bc]d", 3, 3);
345  CHECK_MIN_MAX("a|bc", 1, 2);
346  CHECK_MIN_MAX("ab|c", 1, 2);
347  CHECK_MIN_MAX("a||bc", 0, 2);
348  CHECK_MIN_MAX("|", 0, 0);
349  CHECK_MIN_MAX("(?:ab)", 2, 2);
350  CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
351  CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
352  CHECK_MIN_MAX("(ab)", 2, 2);
353  CHECK_MIN_MAX("(ab|cde)", 2, 3);
354  CHECK_MIN_MAX("(ab)\\1", 2, 4);
355  CHECK_MIN_MAX("(ab|cde)\\1", 2, 6);
356  CHECK_MIN_MAX("(?:ab)?", 0, 2);
357  CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
358  CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
359  CHECK_MIN_MAX("a?", 0, 1);
360  CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
361  CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
362  CHECK_MIN_MAX("a??", 0, 1);
363  CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
364  CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
365  CHECK_MIN_MAX("(?:a?)?", 0, 1);
366  CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
367  CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
368  CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
369  CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
370  CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
371  CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
372  CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
373  CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
374  CHECK_MIN_MAX("a{0}", 0, 0);
375  CHECK_MIN_MAX("(?:a+){0}", 0, 0);
376  CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
377  CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
378  CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
379  CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
380  CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
381  CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
382  CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
383  CHECK_MIN_MAX("a\\bc", 2, 2);
384  CHECK_MIN_MAX("a\\Bc", 2, 2);
385  CHECK_MIN_MAX("a\\sc", 3, 3);
386  CHECK_MIN_MAX("a\\Sc", 3, 3);
387  CHECK_MIN_MAX("a(?=b)c", 2, 2);
388  CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
389  CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
390}
391
392
393TEST(ParserRegression) {
394  CHECK_PARSE_EQ("[A-Z$-][x]", "(! [A-Z $ -] [x])");
395  CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
396  CHECK_PARSE_EQ("{", "'{'");
397  CHECK_PARSE_EQ("a|", "(| 'a' %)");
398}
399
400static void ExpectError(const char* input,
401                        const char* expected) {
402  V8::Initialize(NULL);
403  v8::HandleScope scope(CcTest::isolate());
404  Zone zone(CcTest::i_isolate());
405  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
406  RegExpCompileData result;
407  CHECK(!v8::internal::RegExpParser::ParseRegExp(
408      &reader, false, &result, &zone));
409  CHECK(result.tree == NULL);
410  CHECK(!result.error.is_null());
411  SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS);
412  CHECK_EQ(expected, str.get());
413}
414
415
416TEST(Errors) {
417  const char* kEndBackslash = "\\ at end of pattern";
418  ExpectError("\\", kEndBackslash);
419  const char* kUnterminatedGroup = "Unterminated group";
420  ExpectError("(foo", kUnterminatedGroup);
421  const char* kInvalidGroup = "Invalid group";
422  ExpectError("(?", kInvalidGroup);
423  const char* kUnterminatedCharacterClass = "Unterminated character class";
424  ExpectError("[", kUnterminatedCharacterClass);
425  ExpectError("[a-", kUnterminatedCharacterClass);
426  const char* kNothingToRepeat = "Nothing to repeat";
427  ExpectError("*", kNothingToRepeat);
428  ExpectError("?", kNothingToRepeat);
429  ExpectError("+", kNothingToRepeat);
430  ExpectError("{1}", kNothingToRepeat);
431  ExpectError("{1,2}", kNothingToRepeat);
432  ExpectError("{1,}", kNothingToRepeat);
433
434  // Check that we don't allow more than kMaxCapture captures
435  const int kMaxCaptures = 1 << 16;  // Must match RegExpParser::kMaxCaptures.
436  const char* kTooManyCaptures = "Too many captures";
437  HeapStringAllocator allocator;
438  StringStream accumulator(&allocator);
439  for (int i = 0; i <= kMaxCaptures; i++) {
440    accumulator.Add("()");
441  }
442  SmartArrayPointer<const char> many_captures(accumulator.ToCString());
443  ExpectError(many_captures.get(), kTooManyCaptures);
444}
445
446
447static bool IsDigit(uc16 c) {
448  return ('0' <= c && c <= '9');
449}
450
451
452static bool NotDigit(uc16 c) {
453  return !IsDigit(c);
454}
455
456
457static bool IsWhiteSpaceOrLineTerminator(uc16 c) {
458  // According to ECMA 5.1, 15.10.2.12 the CharacterClassEscape \s includes
459  // WhiteSpace (7.2) and LineTerminator (7.3) values.
460  return v8::internal::WhiteSpaceOrLineTerminator::Is(c);
461}
462
463
464static bool NotWhiteSpaceNorLineTermiantor(uc16 c) {
465  return !IsWhiteSpaceOrLineTerminator(c);
466}
467
468
469static bool NotWord(uc16 c) {
470  return !IsRegExpWord(c);
471}
472
473
474static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
475  Zone zone(CcTest::i_isolate());
476  ZoneList<CharacterRange>* ranges =
477      new(&zone) ZoneList<CharacterRange>(2, &zone);
478  CharacterRange::AddClassEscape(c, ranges, &zone);
479  for (unsigned i = 0; i < (1 << 16); i++) {
480    bool in_class = false;
481    for (int j = 0; !in_class && j < ranges->length(); j++) {
482      CharacterRange& range = ranges->at(j);
483      in_class = (range.from() <= i && i <= range.to());
484    }
485    CHECK_EQ(pred(i), in_class);
486  }
487}
488
489
490TEST(CharacterClassEscapes) {
491  v8::internal::V8::Initialize(NULL);
492  TestCharacterClassEscapes('.', IsRegExpNewline);
493  TestCharacterClassEscapes('d', IsDigit);
494  TestCharacterClassEscapes('D', NotDigit);
495  TestCharacterClassEscapes('s', IsWhiteSpaceOrLineTerminator);
496  TestCharacterClassEscapes('S', NotWhiteSpaceNorLineTermiantor);
497  TestCharacterClassEscapes('w', IsRegExpWord);
498  TestCharacterClassEscapes('W', NotWord);
499}
500
501
502static RegExpNode* Compile(const char* input,
503                           bool multiline,
504                           bool is_ascii,
505                           Zone* zone) {
506  V8::Initialize(NULL);
507  Isolate* isolate = CcTest::i_isolate();
508  FlatStringReader reader(isolate, CStrVector(input));
509  RegExpCompileData compile_data;
510  if (!v8::internal::RegExpParser::ParseRegExp(&reader, multiline,
511                                               &compile_data, zone))
512    return NULL;
513  Handle<String> pattern = isolate->factory()->
514      NewStringFromUtf8(CStrVector(input)).ToHandleChecked();
515  Handle<String> sample_subject =
516      isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked();
517  RegExpEngine::Compile(&compile_data,
518                        false,
519                        false,
520                        multiline,
521                        pattern,
522                        sample_subject,
523                        is_ascii,
524                        zone);
525  return compile_data.node;
526}
527
528
529static void Execute(const char* input,
530                    bool multiline,
531                    bool is_ascii,
532                    bool dot_output = false) {
533  v8::HandleScope scope(CcTest::isolate());
534  Zone zone(CcTest::i_isolate());
535  RegExpNode* node = Compile(input, multiline, is_ascii, &zone);
536  USE(node);
537#ifdef DEBUG
538  if (dot_output) {
539    RegExpEngine::DotPrint(input, node, false);
540  }
541#endif  // DEBUG
542}
543
544
545class TestConfig {
546 public:
547  typedef int Key;
548  typedef int Value;
549  static const int kNoKey;
550  static int NoValue() { return 0; }
551  static inline int Compare(int a, int b) {
552    if (a < b)
553      return -1;
554    else if (a > b)
555      return 1;
556    else
557      return 0;
558  }
559};
560
561
562const int TestConfig::kNoKey = 0;
563
564
565static unsigned PseudoRandom(int i, int j) {
566  return ~(~((i * 781) ^ (j * 329)));
567}
568
569
570TEST(SplayTreeSimple) {
571  v8::internal::V8::Initialize(NULL);
572  static const unsigned kLimit = 1000;
573  Zone zone(CcTest::i_isolate());
574  ZoneSplayTree<TestConfig> tree(&zone);
575  bool seen[kLimit];
576  for (unsigned i = 0; i < kLimit; i++) seen[i] = false;
577#define CHECK_MAPS_EQUAL() do {                                      \
578    for (unsigned k = 0; k < kLimit; k++)                            \
579      CHECK_EQ(seen[k], tree.Find(k, &loc));                         \
580  } while (false)
581  for (int i = 0; i < 50; i++) {
582    for (int j = 0; j < 50; j++) {
583      unsigned next = PseudoRandom(i, j) % kLimit;
584      if (seen[next]) {
585        // We've already seen this one.  Check the value and remove
586        // it.
587        ZoneSplayTree<TestConfig>::Locator loc;
588        CHECK(tree.Find(next, &loc));
589        CHECK_EQ(next, loc.key());
590        CHECK_EQ(3 * next, loc.value());
591        tree.Remove(next);
592        seen[next] = false;
593        CHECK_MAPS_EQUAL();
594      } else {
595        // Check that it wasn't there already and then add it.
596        ZoneSplayTree<TestConfig>::Locator loc;
597        CHECK(!tree.Find(next, &loc));
598        CHECK(tree.Insert(next, &loc));
599        CHECK_EQ(next, loc.key());
600        loc.set_value(3 * next);
601        seen[next] = true;
602        CHECK_MAPS_EQUAL();
603      }
604      int val = PseudoRandom(j, i) % kLimit;
605      if (seen[val]) {
606        ZoneSplayTree<TestConfig>::Locator loc;
607        CHECK(tree.FindGreatestLessThan(val, &loc));
608        CHECK_EQ(loc.key(), val);
609        break;
610      }
611      val = PseudoRandom(i + j, i - j) % kLimit;
612      if (seen[val]) {
613        ZoneSplayTree<TestConfig>::Locator loc;
614        CHECK(tree.FindLeastGreaterThan(val, &loc));
615        CHECK_EQ(loc.key(), val);
616        break;
617      }
618    }
619  }
620}
621
622
623TEST(DispatchTableConstruction) {
624  v8::internal::V8::Initialize(NULL);
625  // Initialize test data.
626  static const int kLimit = 1000;
627  static const int kRangeCount = 8;
628  static const int kRangeSize = 16;
629  uc16 ranges[kRangeCount][2 * kRangeSize];
630  for (int i = 0; i < kRangeCount; i++) {
631    Vector<uc16> range(ranges[i], 2 * kRangeSize);
632    for (int j = 0; j < 2 * kRangeSize; j++) {
633      range[j] = PseudoRandom(i + 25, j + 87) % kLimit;
634    }
635    range.Sort();
636    for (int j = 1; j < 2 * kRangeSize; j++) {
637      CHECK(range[j-1] <= range[j]);
638    }
639  }
640  // Enter test data into dispatch table.
641  Zone zone(CcTest::i_isolate());
642  DispatchTable table(&zone);
643  for (int i = 0; i < kRangeCount; i++) {
644    uc16* range = ranges[i];
645    for (int j = 0; j < 2 * kRangeSize; j += 2)
646      table.AddRange(CharacterRange(range[j], range[j + 1]), i, &zone);
647  }
648  // Check that the table looks as we would expect
649  for (int p = 0; p < kLimit; p++) {
650    OutSet* outs = table.Get(p);
651    for (int j = 0; j < kRangeCount; j++) {
652      uc16* range = ranges[j];
653      bool is_on = false;
654      for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2)
655        is_on = (range[k] <= p && p <= range[k + 1]);
656      CHECK_EQ(is_on, outs->Get(j));
657    }
658  }
659}
660
661
662// Test of debug-only syntax.
663#ifdef DEBUG
664
665TEST(ParsePossessiveRepetition) {
666  bool old_flag_value = FLAG_regexp_possessive_quantifier;
667
668  // Enable possessive quantifier syntax.
669  FLAG_regexp_possessive_quantifier = true;
670
671  CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')");
672  CHECK_PARSE_EQ("a++", "(# 1 - p 'a')");
673  CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')");
674  CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')");
675  CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");
676
677  // Disable possessive quantifier syntax.
678  FLAG_regexp_possessive_quantifier = false;
679
680  CHECK_PARSE_ERROR("a*+");
681  CHECK_PARSE_ERROR("a++");
682  CHECK_PARSE_ERROR("a?+");
683  CHECK_PARSE_ERROR("a{10,20}+");
684  CHECK_PARSE_ERROR("a{10,20}+b");
685
686  FLAG_regexp_possessive_quantifier = old_flag_value;
687}
688
689#endif
690
691// Tests of interpreter.
692
693
694#ifndef V8_INTERPRETED_REGEXP
695
696#if V8_TARGET_ARCH_IA32
697typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler;
698#elif V8_TARGET_ARCH_X64
699typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler;
700#elif V8_TARGET_ARCH_ARM
701typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler;
702#elif V8_TARGET_ARCH_ARM64
703typedef RegExpMacroAssemblerARM64 ArchRegExpMacroAssembler;
704#elif V8_TARGET_ARCH_MIPS
705typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
706#elif V8_TARGET_ARCH_X87
707typedef RegExpMacroAssemblerX87 ArchRegExpMacroAssembler;
708#endif
709
710class ContextInitializer {
711 public:
712  ContextInitializer()
713      : scope_(CcTest::isolate()),
714        env_(v8::Context::New(CcTest::isolate())) {
715    env_->Enter();
716  }
717  ~ContextInitializer() {
718    env_->Exit();
719  }
720 private:
721  v8::HandleScope scope_;
722  v8::Handle<v8::Context> env_;
723};
724
725
726static ArchRegExpMacroAssembler::Result Execute(Code* code,
727                                                String* input,
728                                                int start_offset,
729                                                const byte* input_start,
730                                                const byte* input_end,
731                                                int* captures) {
732  return NativeRegExpMacroAssembler::Execute(
733      code,
734      input,
735      start_offset,
736      input_start,
737      input_end,
738      captures,
739      0,
740      CcTest::i_isolate());
741}
742
743
744TEST(MacroAssemblerNativeSuccess) {
745  v8::V8::Initialize();
746  ContextInitializer initializer;
747  Isolate* isolate = CcTest::i_isolate();
748  Factory* factory = isolate->factory();
749  Zone zone(isolate);
750
751  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone);
752
753  m.Succeed();
754
755  Handle<String> source = factory->NewStringFromStaticAscii("");
756  Handle<Object> code_object = m.GetCode(source);
757  Handle<Code> code = Handle<Code>::cast(code_object);
758
759  int captures[4] = {42, 37, 87, 117};
760  Handle<String> input = factory->NewStringFromStaticAscii("foofoo");
761  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
762  const byte* start_adr =
763      reinterpret_cast<const byte*>(seq_input->GetCharsAddress());
764
765  NativeRegExpMacroAssembler::Result result =
766      Execute(*code,
767              *input,
768              0,
769              start_adr,
770              start_adr + seq_input->length(),
771              captures);
772
773  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
774  CHECK_EQ(-1, captures[0]);
775  CHECK_EQ(-1, captures[1]);
776  CHECK_EQ(-1, captures[2]);
777  CHECK_EQ(-1, captures[3]);
778}
779
780
781TEST(MacroAssemblerNativeSimple) {
782  v8::V8::Initialize();
783  ContextInitializer initializer;
784  Isolate* isolate = CcTest::i_isolate();
785  Factory* factory = isolate->factory();
786  Zone zone(isolate);
787
788  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone);
789
790  Label fail, backtrack;
791  m.PushBacktrack(&fail);
792  m.CheckNotAtStart(NULL);
793  m.LoadCurrentCharacter(2, NULL);
794  m.CheckNotCharacter('o', NULL);
795  m.LoadCurrentCharacter(1, NULL, false);
796  m.CheckNotCharacter('o', NULL);
797  m.LoadCurrentCharacter(0, NULL, false);
798  m.CheckNotCharacter('f', NULL);
799  m.WriteCurrentPositionToRegister(0, 0);
800  m.WriteCurrentPositionToRegister(1, 3);
801  m.AdvanceCurrentPosition(3);
802  m.PushBacktrack(&backtrack);
803  m.Succeed();
804  m.Bind(&backtrack);
805  m.Backtrack();
806  m.Bind(&fail);
807  m.Fail();
808
809  Handle<String> source = factory->NewStringFromStaticAscii("^foo");
810  Handle<Object> code_object = m.GetCode(source);
811  Handle<Code> code = Handle<Code>::cast(code_object);
812
813  int captures[4] = {42, 37, 87, 117};
814  Handle<String> input = factory->NewStringFromStaticAscii("foofoo");
815  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
816  Address start_adr = seq_input->GetCharsAddress();
817
818  NativeRegExpMacroAssembler::Result result =
819      Execute(*code,
820              *input,
821              0,
822              start_adr,
823              start_adr + input->length(),
824              captures);
825
826  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
827  CHECK_EQ(0, captures[0]);
828  CHECK_EQ(3, captures[1]);
829  CHECK_EQ(-1, captures[2]);
830  CHECK_EQ(-1, captures[3]);
831
832  input = factory->NewStringFromStaticAscii("barbarbar");
833  seq_input = Handle<SeqOneByteString>::cast(input);
834  start_adr = seq_input->GetCharsAddress();
835
836  result = Execute(*code,
837                   *input,
838                   0,
839                   start_adr,
840                   start_adr + input->length(),
841                   captures);
842
843  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
844}
845
846
847TEST(MacroAssemblerNativeSimpleUC16) {
848  v8::V8::Initialize();
849  ContextInitializer initializer;
850  Isolate* isolate = CcTest::i_isolate();
851  Factory* factory = isolate->factory();
852  Zone zone(isolate);
853
854  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4, &zone);
855
856  Label fail, backtrack;
857  m.PushBacktrack(&fail);
858  m.CheckNotAtStart(NULL);
859  m.LoadCurrentCharacter(2, NULL);
860  m.CheckNotCharacter('o', NULL);
861  m.LoadCurrentCharacter(1, NULL, false);
862  m.CheckNotCharacter('o', NULL);
863  m.LoadCurrentCharacter(0, NULL, false);
864  m.CheckNotCharacter('f', NULL);
865  m.WriteCurrentPositionToRegister(0, 0);
866  m.WriteCurrentPositionToRegister(1, 3);
867  m.AdvanceCurrentPosition(3);
868  m.PushBacktrack(&backtrack);
869  m.Succeed();
870  m.Bind(&backtrack);
871  m.Backtrack();
872  m.Bind(&fail);
873  m.Fail();
874
875  Handle<String> source = factory->NewStringFromStaticAscii("^foo");
876  Handle<Object> code_object = m.GetCode(source);
877  Handle<Code> code = Handle<Code>::cast(code_object);
878
879  int captures[4] = {42, 37, 87, 117};
880  const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o',
881                              static_cast<uc16>(0x2603)};
882  Handle<String> input = factory->NewStringFromTwoByte(
883      Vector<const uc16>(input_data, 6)).ToHandleChecked();
884  Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
885  Address start_adr = seq_input->GetCharsAddress();
886
887  NativeRegExpMacroAssembler::Result result =
888      Execute(*code,
889              *input,
890              0,
891              start_adr,
892              start_adr + input->length(),
893              captures);
894
895  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
896  CHECK_EQ(0, captures[0]);
897  CHECK_EQ(3, captures[1]);
898  CHECK_EQ(-1, captures[2]);
899  CHECK_EQ(-1, captures[3]);
900
901  const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a',
902                               static_cast<uc16>(0x2603)};
903  input = factory->NewStringFromTwoByte(
904      Vector<const uc16>(input_data2, 9)).ToHandleChecked();
905  seq_input = Handle<SeqTwoByteString>::cast(input);
906  start_adr = seq_input->GetCharsAddress();
907
908  result = Execute(*code,
909                   *input,
910                   0,
911                   start_adr,
912                   start_adr + input->length() * 2,
913                   captures);
914
915  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
916}
917
918
919TEST(MacroAssemblerNativeBacktrack) {
920  v8::V8::Initialize();
921  ContextInitializer initializer;
922  Isolate* isolate = CcTest::i_isolate();
923  Factory* factory = isolate->factory();
924  Zone zone(isolate);
925
926  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0, &zone);
927
928  Label fail;
929  Label backtrack;
930  m.LoadCurrentCharacter(10, &fail);
931  m.Succeed();
932  m.Bind(&fail);
933  m.PushBacktrack(&backtrack);
934  m.LoadCurrentCharacter(10, NULL);
935  m.Succeed();
936  m.Bind(&backtrack);
937  m.Fail();
938
939  Handle<String> source = factory->NewStringFromStaticAscii("..........");
940  Handle<Object> code_object = m.GetCode(source);
941  Handle<Code> code = Handle<Code>::cast(code_object);
942
943  Handle<String> input = factory->NewStringFromStaticAscii("foofoo");
944  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
945  Address start_adr = seq_input->GetCharsAddress();
946
947  NativeRegExpMacroAssembler::Result result =
948      Execute(*code,
949              *input,
950              0,
951              start_adr,
952              start_adr + input->length(),
953              NULL);
954
955  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
956}
957
958
959TEST(MacroAssemblerNativeBackReferenceASCII) {
960  v8::V8::Initialize();
961  ContextInitializer initializer;
962  Isolate* isolate = CcTest::i_isolate();
963  Factory* factory = isolate->factory();
964  Zone zone(isolate);
965
966  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone);
967
968  m.WriteCurrentPositionToRegister(0, 0);
969  m.AdvanceCurrentPosition(2);
970  m.WriteCurrentPositionToRegister(1, 0);
971  Label nomatch;
972  m.CheckNotBackReference(0, &nomatch);
973  m.Fail();
974  m.Bind(&nomatch);
975  m.AdvanceCurrentPosition(2);
976  Label missing_match;
977  m.CheckNotBackReference(0, &missing_match);
978  m.WriteCurrentPositionToRegister(2, 0);
979  m.Succeed();
980  m.Bind(&missing_match);
981  m.Fail();
982
983  Handle<String> source = factory->NewStringFromStaticAscii("^(..)..\1");
984  Handle<Object> code_object = m.GetCode(source);
985  Handle<Code> code = Handle<Code>::cast(code_object);
986
987  Handle<String> input = factory->NewStringFromStaticAscii("fooofo");
988  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
989  Address start_adr = seq_input->GetCharsAddress();
990
991  int output[4];
992  NativeRegExpMacroAssembler::Result result =
993      Execute(*code,
994              *input,
995              0,
996              start_adr,
997              start_adr + input->length(),
998              output);
999
1000  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1001  CHECK_EQ(0, output[0]);
1002  CHECK_EQ(2, output[1]);
1003  CHECK_EQ(6, output[2]);
1004  CHECK_EQ(-1, output[3]);
1005}
1006
1007
1008TEST(MacroAssemblerNativeBackReferenceUC16) {
1009  v8::V8::Initialize();
1010  ContextInitializer initializer;
1011  Isolate* isolate = CcTest::i_isolate();
1012  Factory* factory = isolate->factory();
1013  Zone zone(isolate);
1014
1015  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4, &zone);
1016
1017  m.WriteCurrentPositionToRegister(0, 0);
1018  m.AdvanceCurrentPosition(2);
1019  m.WriteCurrentPositionToRegister(1, 0);
1020  Label nomatch;
1021  m.CheckNotBackReference(0, &nomatch);
1022  m.Fail();
1023  m.Bind(&nomatch);
1024  m.AdvanceCurrentPosition(2);
1025  Label missing_match;
1026  m.CheckNotBackReference(0, &missing_match);
1027  m.WriteCurrentPositionToRegister(2, 0);
1028  m.Succeed();
1029  m.Bind(&missing_match);
1030  m.Fail();
1031
1032  Handle<String> source = factory->NewStringFromStaticAscii("^(..)..\1");
1033  Handle<Object> code_object = m.GetCode(source);
1034  Handle<Code> code = Handle<Code>::cast(code_object);
1035
1036  const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028};
1037  Handle<String> input = factory->NewStringFromTwoByte(
1038      Vector<const uc16>(input_data, 6)).ToHandleChecked();
1039  Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
1040  Address start_adr = seq_input->GetCharsAddress();
1041
1042  int output[4];
1043  NativeRegExpMacroAssembler::Result result =
1044      Execute(*code,
1045              *input,
1046              0,
1047              start_adr,
1048              start_adr + input->length() * 2,
1049              output);
1050
1051  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1052  CHECK_EQ(0, output[0]);
1053  CHECK_EQ(2, output[1]);
1054  CHECK_EQ(6, output[2]);
1055  CHECK_EQ(-1, output[3]);
1056}
1057
1058
1059
1060TEST(MacroAssemblernativeAtStart) {
1061  v8::V8::Initialize();
1062  ContextInitializer initializer;
1063  Isolate* isolate = CcTest::i_isolate();
1064  Factory* factory = isolate->factory();
1065  Zone zone(isolate);
1066
1067  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0, &zone);
1068
1069  Label not_at_start, newline, fail;
1070  m.CheckNotAtStart(&not_at_start);
1071  // Check that prevchar = '\n' and current = 'f'.
1072  m.CheckCharacter('\n', &newline);
1073  m.Bind(&fail);
1074  m.Fail();
1075  m.Bind(&newline);
1076  m.LoadCurrentCharacter(0, &fail);
1077  m.CheckNotCharacter('f', &fail);
1078  m.Succeed();
1079
1080  m.Bind(&not_at_start);
1081  // Check that prevchar = 'o' and current = 'b'.
1082  Label prevo;
1083  m.CheckCharacter('o', &prevo);
1084  m.Fail();
1085  m.Bind(&prevo);
1086  m.LoadCurrentCharacter(0, &fail);
1087  m.CheckNotCharacter('b', &fail);
1088  m.Succeed();
1089
1090  Handle<String> source = factory->NewStringFromStaticAscii("(^f|ob)");
1091  Handle<Object> code_object = m.GetCode(source);
1092  Handle<Code> code = Handle<Code>::cast(code_object);
1093
1094  Handle<String> input = factory->NewStringFromStaticAscii("foobar");
1095  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1096  Address start_adr = seq_input->GetCharsAddress();
1097
1098  NativeRegExpMacroAssembler::Result result =
1099      Execute(*code,
1100              *input,
1101              0,
1102              start_adr,
1103              start_adr + input->length(),
1104              NULL);
1105
1106  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1107
1108  result = Execute(*code,
1109                   *input,
1110                   3,
1111                   start_adr + 3,
1112                   start_adr + input->length(),
1113                   NULL);
1114
1115  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1116}
1117
1118
1119TEST(MacroAssemblerNativeBackRefNoCase) {
1120  v8::V8::Initialize();
1121  ContextInitializer initializer;
1122  Isolate* isolate = CcTest::i_isolate();
1123  Factory* factory = isolate->factory();
1124  Zone zone(isolate);
1125
1126  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone);
1127
1128  Label fail, succ;
1129
1130  m.WriteCurrentPositionToRegister(0, 0);
1131  m.WriteCurrentPositionToRegister(2, 0);
1132  m.AdvanceCurrentPosition(3);
1133  m.WriteCurrentPositionToRegister(3, 0);
1134  m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "AbC".
1135  m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "ABC".
1136  Label expected_fail;
1137  m.CheckNotBackReferenceIgnoreCase(2, &expected_fail);
1138  m.Bind(&fail);
1139  m.Fail();
1140
1141  m.Bind(&expected_fail);
1142  m.AdvanceCurrentPosition(3);  // Skip "xYz"
1143  m.CheckNotBackReferenceIgnoreCase(2, &succ);
1144  m.Fail();
1145
1146  m.Bind(&succ);
1147  m.WriteCurrentPositionToRegister(1, 0);
1148  m.Succeed();
1149
1150  Handle<String> source =
1151      factory->NewStringFromStaticAscii("^(abc)\1\1(?!\1)...(?!\1)");
1152  Handle<Object> code_object = m.GetCode(source);
1153  Handle<Code> code = Handle<Code>::cast(code_object);
1154
1155  Handle<String> input =
1156      factory->NewStringFromStaticAscii("aBcAbCABCxYzab");
1157  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1158  Address start_adr = seq_input->GetCharsAddress();
1159
1160  int output[4];
1161  NativeRegExpMacroAssembler::Result result =
1162      Execute(*code,
1163              *input,
1164              0,
1165              start_adr,
1166              start_adr + input->length(),
1167              output);
1168
1169  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1170  CHECK_EQ(0, output[0]);
1171  CHECK_EQ(12, output[1]);
1172  CHECK_EQ(0, output[2]);
1173  CHECK_EQ(3, output[3]);
1174}
1175
1176
1177
1178TEST(MacroAssemblerNativeRegisters) {
1179  v8::V8::Initialize();
1180  ContextInitializer initializer;
1181  Isolate* isolate = CcTest::i_isolate();
1182  Factory* factory = isolate->factory();
1183  Zone zone(isolate);
1184
1185  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 6, &zone);
1186
1187  uc16 foo_chars[3] = {'f', 'o', 'o'};
1188  Vector<const uc16> foo(foo_chars, 3);
1189
1190  enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt };
1191  Label fail;
1192  Label backtrack;
1193  m.WriteCurrentPositionToRegister(out1, 0);  // Output: [0]
1194  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1195  m.PushBacktrack(&backtrack);
1196  m.WriteStackPointerToRegister(sp);
1197  // Fill stack and registers
1198  m.AdvanceCurrentPosition(2);
1199  m.WriteCurrentPositionToRegister(out1, 0);
1200  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1201  m.PushBacktrack(&fail);
1202  // Drop backtrack stack frames.
1203  m.ReadStackPointerFromRegister(sp);
1204  // And take the first backtrack (to &backtrack)
1205  m.Backtrack();
1206
1207  m.PushCurrentPosition();
1208  m.AdvanceCurrentPosition(2);
1209  m.PopCurrentPosition();
1210
1211  m.Bind(&backtrack);
1212  m.PopRegister(out1);
1213  m.ReadCurrentPositionFromRegister(out1);
1214  m.AdvanceCurrentPosition(3);
1215  m.WriteCurrentPositionToRegister(out2, 0);  // [0,3]
1216
1217  Label loop;
1218  m.SetRegister(loop_cnt, 0);  // loop counter
1219  m.Bind(&loop);
1220  m.AdvanceRegister(loop_cnt, 1);
1221  m.AdvanceCurrentPosition(1);
1222  m.IfRegisterLT(loop_cnt, 3, &loop);
1223  m.WriteCurrentPositionToRegister(out3, 0);  // [0,3,6]
1224
1225  Label loop2;
1226  m.SetRegister(loop_cnt, 2);  // loop counter
1227  m.Bind(&loop2);
1228  m.AdvanceRegister(loop_cnt, -1);
1229  m.AdvanceCurrentPosition(1);
1230  m.IfRegisterGE(loop_cnt, 0, &loop2);
1231  m.WriteCurrentPositionToRegister(out4, 0);  // [0,3,6,9]
1232
1233  Label loop3;
1234  Label exit_loop3;
1235  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1236  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1237  m.ReadCurrentPositionFromRegister(out3);
1238  m.Bind(&loop3);
1239  m.AdvanceCurrentPosition(1);
1240  m.CheckGreedyLoop(&exit_loop3);
1241  m.GoTo(&loop3);
1242  m.Bind(&exit_loop3);
1243  m.PopCurrentPosition();
1244  m.WriteCurrentPositionToRegister(out5, 0);  // [0,3,6,9,9,-1]
1245
1246  m.Succeed();
1247
1248  m.Bind(&fail);
1249  m.Fail();
1250
1251  Handle<String> source =
1252      factory->NewStringFromStaticAscii("<loop test>");
1253  Handle<Object> code_object = m.GetCode(source);
1254  Handle<Code> code = Handle<Code>::cast(code_object);
1255
1256  // String long enough for test (content doesn't matter).
1257  Handle<String> input =
1258      factory->NewStringFromStaticAscii("foofoofoofoofoo");
1259  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1260  Address start_adr = seq_input->GetCharsAddress();
1261
1262  int output[6];
1263  NativeRegExpMacroAssembler::Result result =
1264      Execute(*code,
1265              *input,
1266              0,
1267              start_adr,
1268              start_adr + input->length(),
1269              output);
1270
1271  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1272  CHECK_EQ(0, output[0]);
1273  CHECK_EQ(3, output[1]);
1274  CHECK_EQ(6, output[2]);
1275  CHECK_EQ(9, output[3]);
1276  CHECK_EQ(9, output[4]);
1277  CHECK_EQ(-1, output[5]);
1278}
1279
1280
1281TEST(MacroAssemblerStackOverflow) {
1282  v8::V8::Initialize();
1283  ContextInitializer initializer;
1284  Isolate* isolate = CcTest::i_isolate();
1285  Factory* factory = isolate->factory();
1286  Zone zone(isolate);
1287
1288  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0, &zone);
1289
1290  Label loop;
1291  m.Bind(&loop);
1292  m.PushBacktrack(&loop);
1293  m.GoTo(&loop);
1294
1295  Handle<String> source =
1296      factory->NewStringFromStaticAscii("<stack overflow test>");
1297  Handle<Object> code_object = m.GetCode(source);
1298  Handle<Code> code = Handle<Code>::cast(code_object);
1299
1300  // String long enough for test (content doesn't matter).
1301  Handle<String> input =
1302      factory->NewStringFromStaticAscii("dummy");
1303  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1304  Address start_adr = seq_input->GetCharsAddress();
1305
1306  NativeRegExpMacroAssembler::Result result =
1307      Execute(*code,
1308              *input,
1309              0,
1310              start_adr,
1311              start_adr + input->length(),
1312              NULL);
1313
1314  CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result);
1315  CHECK(isolate->has_pending_exception());
1316  isolate->clear_pending_exception();
1317}
1318
1319
1320TEST(MacroAssemblerNativeLotsOfRegisters) {
1321  v8::V8::Initialize();
1322  ContextInitializer initializer;
1323  Isolate* isolate = CcTest::i_isolate();
1324  Factory* factory = isolate->factory();
1325  Zone zone(isolate);
1326
1327  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 2, &zone);
1328
1329  // At least 2048, to ensure the allocated space for registers
1330  // span one full page.
1331  const int large_number = 8000;
1332  m.WriteCurrentPositionToRegister(large_number, 42);
1333  m.WriteCurrentPositionToRegister(0, 0);
1334  m.WriteCurrentPositionToRegister(1, 1);
1335  Label done;
1336  m.CheckNotBackReference(0, &done);  // Performs a system-stack push.
1337  m.Bind(&done);
1338  m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck);
1339  m.PopRegister(1);
1340  m.Succeed();
1341
1342  Handle<String> source =
1343      factory->NewStringFromStaticAscii("<huge register space test>");
1344  Handle<Object> code_object = m.GetCode(source);
1345  Handle<Code> code = Handle<Code>::cast(code_object);
1346
1347  // String long enough for test (content doesn't matter).
1348  Handle<String> input =
1349      factory->NewStringFromStaticAscii("sample text");
1350  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1351  Address start_adr = seq_input->GetCharsAddress();
1352
1353  int captures[2];
1354  NativeRegExpMacroAssembler::Result result =
1355      Execute(*code,
1356              *input,
1357              0,
1358              start_adr,
1359              start_adr + input->length(),
1360              captures);
1361
1362  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1363  CHECK_EQ(0, captures[0]);
1364  CHECK_EQ(42, captures[1]);
1365
1366  isolate->clear_pending_exception();
1367}
1368
1369#else  // V8_INTERPRETED_REGEXP
1370
1371TEST(MacroAssembler) {
1372  V8::Initialize(NULL);
1373  byte codes[1024];
1374  Zone zone(CcTest::i_isolate());
1375  RegExpMacroAssemblerIrregexp m(Vector<byte>(codes, 1024), &zone);
1376  // ^f(o)o.
1377  Label start, fail, backtrack;
1378
1379  m.SetRegister(4, 42);
1380  m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck);
1381  m.AdvanceRegister(4, 42);
1382  m.GoTo(&start);
1383  m.Fail();
1384  m.Bind(&start);
1385  m.PushBacktrack(&fail);
1386  m.CheckNotAtStart(NULL);
1387  m.LoadCurrentCharacter(0, NULL);
1388  m.CheckNotCharacter('f', NULL);
1389  m.LoadCurrentCharacter(1, NULL);
1390  m.CheckNotCharacter('o', NULL);
1391  m.LoadCurrentCharacter(2, NULL);
1392  m.CheckNotCharacter('o', NULL);
1393  m.WriteCurrentPositionToRegister(0, 0);
1394  m.WriteCurrentPositionToRegister(1, 3);
1395  m.WriteCurrentPositionToRegister(2, 1);
1396  m.WriteCurrentPositionToRegister(3, 2);
1397  m.AdvanceCurrentPosition(3);
1398  m.PushBacktrack(&backtrack);
1399  m.Succeed();
1400  m.Bind(&backtrack);
1401  m.ClearRegisters(2, 3);
1402  m.Backtrack();
1403  m.Bind(&fail);
1404  m.PopRegister(0);
1405  m.Fail();
1406
1407  Isolate* isolate = CcTest::i_isolate();
1408  Factory* factory = isolate->factory();
1409  HandleScope scope(isolate);
1410
1411  Handle<String> source = factory->NewStringFromStaticAscii("^f(o)o");
1412  Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source));
1413  int captures[5];
1414
1415  const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'};
1416  Handle<String> f1_16 = factory->NewStringFromTwoByte(
1417      Vector<const uc16>(str1, 6)).ToHandleChecked();
1418
1419  CHECK(IrregexpInterpreter::Match(isolate, array, f1_16, captures, 0));
1420  CHECK_EQ(0, captures[0]);
1421  CHECK_EQ(3, captures[1]);
1422  CHECK_EQ(1, captures[2]);
1423  CHECK_EQ(2, captures[3]);
1424  CHECK_EQ(84, captures[4]);
1425
1426  const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'};
1427  Handle<String> f2_16 = factory->NewStringFromTwoByte(
1428      Vector<const uc16>(str2, 6)).ToHandleChecked();
1429
1430  CHECK(!IrregexpInterpreter::Match(isolate, array, f2_16, captures, 0));
1431  CHECK_EQ(42, captures[0]);
1432}
1433
1434#endif  // V8_INTERPRETED_REGEXP
1435
1436
1437TEST(AddInverseToTable) {
1438  v8::internal::V8::Initialize(NULL);
1439  static const int kLimit = 1000;
1440  static const int kRangeCount = 16;
1441  for (int t = 0; t < 10; t++) {
1442    Zone zone(CcTest::i_isolate());
1443    ZoneList<CharacterRange>* ranges =
1444        new(&zone) ZoneList<CharacterRange>(kRangeCount, &zone);
1445    for (int i = 0; i < kRangeCount; i++) {
1446      int from = PseudoRandom(t + 87, i + 25) % kLimit;
1447      int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20));
1448      if (to > kLimit) to = kLimit;
1449      ranges->Add(CharacterRange(from, to), &zone);
1450    }
1451    DispatchTable table(&zone);
1452    DispatchTableConstructor cons(&table, false, &zone);
1453    cons.set_choice_index(0);
1454    cons.AddInverse(ranges);
1455    for (int i = 0; i < kLimit; i++) {
1456      bool is_on = false;
1457      for (int j = 0; !is_on && j < kRangeCount; j++)
1458        is_on = ranges->at(j).Contains(i);
1459      OutSet* set = table.Get(i);
1460      CHECK_EQ(is_on, set->Get(0) == false);
1461    }
1462  }
1463  Zone zone(CcTest::i_isolate());
1464  ZoneList<CharacterRange>* ranges =
1465      new(&zone) ZoneList<CharacterRange>(1, &zone);
1466  ranges->Add(CharacterRange(0xFFF0, 0xFFFE), &zone);
1467  DispatchTable table(&zone);
1468  DispatchTableConstructor cons(&table, false, &zone);
1469  cons.set_choice_index(0);
1470  cons.AddInverse(ranges);
1471  CHECK(!table.Get(0xFFFE)->Get(0));
1472  CHECK(table.Get(0xFFFF)->Get(0));
1473}
1474
1475
1476static uc32 canonicalize(uc32 c) {
1477  unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
1478  int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL);
1479  if (count == 0) {
1480    return c;
1481  } else {
1482    CHECK_EQ(1, count);
1483    return canon[0];
1484  }
1485}
1486
1487
1488TEST(LatinCanonicalize) {
1489  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1490  for (char lower = 'a'; lower <= 'z'; lower++) {
1491    char upper = lower + ('A' - 'a');
1492    CHECK_EQ(canonicalize(lower), canonicalize(upper));
1493    unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1494    int length = un_canonicalize.get(lower, '\0', uncanon);
1495    CHECK_EQ(2, length);
1496    CHECK_EQ(upper, uncanon[0]);
1497    CHECK_EQ(lower, uncanon[1]);
1498  }
1499  for (uc32 c = 128; c < (1 << 21); c++)
1500    CHECK_GE(canonicalize(c), 128);
1501  unibrow::Mapping<unibrow::ToUppercase> to_upper;
1502  // Canonicalization is only defined for the Basic Multilingual Plane.
1503  for (uc32 c = 0; c < (1 << 16); c++) {
1504    unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
1505    int length = to_upper.get(c, '\0', upper);
1506    if (length == 0) {
1507      length = 1;
1508      upper[0] = c;
1509    }
1510    uc32 u = upper[0];
1511    if (length > 1 || (c >= 128 && u < 128))
1512      u = c;
1513    CHECK_EQ(u, canonicalize(c));
1514  }
1515}
1516
1517
1518static uc32 CanonRangeEnd(uc32 c) {
1519  unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
1520  int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
1521  if (count == 0) {
1522    return c;
1523  } else {
1524    CHECK_EQ(1, count);
1525    return canon[0];
1526  }
1527}
1528
1529
1530TEST(RangeCanonicalization) {
1531  // Check that we arrive at the same result when using the basic
1532  // range canonicalization primitives as when using immediate
1533  // canonicalization.
1534  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1535  int block_start = 0;
1536  while (block_start <= 0xFFFF) {
1537    uc32 block_end = CanonRangeEnd(block_start);
1538    unsigned block_length = block_end - block_start + 1;
1539    if (block_length > 1) {
1540      unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1541      int first_length = un_canonicalize.get(block_start, '\0', first);
1542      for (unsigned i = 1; i < block_length; i++) {
1543        unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1544        int succ_length = un_canonicalize.get(block_start + i, '\0', succ);
1545        CHECK_EQ(first_length, succ_length);
1546        for (int j = 0; j < succ_length; j++) {
1547          int calc = first[j] + i;
1548          int found = succ[j];
1549          CHECK_EQ(calc, found);
1550        }
1551      }
1552    }
1553    block_start = block_start + block_length;
1554  }
1555}
1556
1557
1558TEST(UncanonicalizeEquivalence) {
1559  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1560  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1561  for (int i = 0; i < (1 << 16); i++) {
1562    int length = un_canonicalize.get(i, '\0', chars);
1563    for (int j = 0; j < length; j++) {
1564      unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1565      int length2 = un_canonicalize.get(chars[j], '\0', chars2);
1566      CHECK_EQ(length, length2);
1567      for (int k = 0; k < length; k++)
1568        CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k]));
1569    }
1570  }
1571}
1572
1573
1574static void TestRangeCaseIndependence(CharacterRange input,
1575                                      Vector<CharacterRange> expected) {
1576  Zone zone(CcTest::i_isolate());
1577  int count = expected.length();
1578  ZoneList<CharacterRange>* list =
1579      new(&zone) ZoneList<CharacterRange>(count, &zone);
1580  input.AddCaseEquivalents(list, false, &zone);
1581  CHECK_EQ(count, list->length());
1582  for (int i = 0; i < list->length(); i++) {
1583    CHECK_EQ(expected[i].from(), list->at(i).from());
1584    CHECK_EQ(expected[i].to(), list->at(i).to());
1585  }
1586}
1587
1588
1589static void TestSimpleRangeCaseIndependence(CharacterRange input,
1590                                            CharacterRange expected) {
1591  EmbeddedVector<CharacterRange, 1> vector;
1592  vector[0] = expected;
1593  TestRangeCaseIndependence(input, vector);
1594}
1595
1596
1597TEST(CharacterRangeCaseIndependence) {
1598  v8::internal::V8::Initialize(NULL);
1599  TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'),
1600                                  CharacterRange::Singleton('A'));
1601  TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'),
1602                                  CharacterRange::Singleton('Z'));
1603  TestSimpleRangeCaseIndependence(CharacterRange('a', 'z'),
1604                                  CharacterRange('A', 'Z'));
1605  TestSimpleRangeCaseIndependence(CharacterRange('c', 'f'),
1606                                  CharacterRange('C', 'F'));
1607  TestSimpleRangeCaseIndependence(CharacterRange('a', 'b'),
1608                                  CharacterRange('A', 'B'));
1609  TestSimpleRangeCaseIndependence(CharacterRange('y', 'z'),
1610                                  CharacterRange('Y', 'Z'));
1611  TestSimpleRangeCaseIndependence(CharacterRange('a' - 1, 'z' + 1),
1612                                  CharacterRange('A', 'Z'));
1613  TestSimpleRangeCaseIndependence(CharacterRange('A', 'Z'),
1614                                  CharacterRange('a', 'z'));
1615  TestSimpleRangeCaseIndependence(CharacterRange('C', 'F'),
1616                                  CharacterRange('c', 'f'));
1617  TestSimpleRangeCaseIndependence(CharacterRange('A' - 1, 'Z' + 1),
1618                                  CharacterRange('a', 'z'));
1619  // Here we need to add [l-z] to complete the case independence of
1620  // [A-Za-z] but we expect [a-z] to be added since we always add a
1621  // whole block at a time.
1622  TestSimpleRangeCaseIndependence(CharacterRange('A', 'k'),
1623                                  CharacterRange('a', 'z'));
1624}
1625
1626
1627static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
1628  if (ranges == NULL)
1629    return false;
1630  for (int i = 0; i < ranges->length(); i++) {
1631    CharacterRange range = ranges->at(i);
1632    if (range.from() <= c && c <= range.to())
1633      return true;
1634  }
1635  return false;
1636}
1637
1638
1639TEST(CharClassDifference) {
1640  v8::internal::V8::Initialize(NULL);
1641  Zone zone(CcTest::i_isolate());
1642  ZoneList<CharacterRange>* base =
1643      new(&zone) ZoneList<CharacterRange>(1, &zone);
1644  base->Add(CharacterRange::Everything(), &zone);
1645  Vector<const int> overlay = CharacterRange::GetWordBounds();
1646  ZoneList<CharacterRange>* included = NULL;
1647  ZoneList<CharacterRange>* excluded = NULL;
1648  CharacterRange::Split(base, overlay, &included, &excluded, &zone);
1649  for (int i = 0; i < (1 << 16); i++) {
1650    bool in_base = InClass(i, base);
1651    if (in_base) {
1652      bool in_overlay = false;
1653      for (int j = 0; !in_overlay && j < overlay.length(); j += 2) {
1654        if (overlay[j] <= i && i < overlay[j+1])
1655          in_overlay = true;
1656      }
1657      CHECK_EQ(in_overlay, InClass(i, included));
1658      CHECK_EQ(!in_overlay, InClass(i, excluded));
1659    } else {
1660      CHECK(!InClass(i, included));
1661      CHECK(!InClass(i, excluded));
1662    }
1663  }
1664}
1665
1666
1667TEST(CanonicalizeCharacterSets) {
1668  v8::internal::V8::Initialize(NULL);
1669  Zone zone(CcTest::i_isolate());
1670  ZoneList<CharacterRange>* list =
1671      new(&zone) ZoneList<CharacterRange>(4, &zone);
1672  CharacterSet set(list);
1673
1674  list->Add(CharacterRange(10, 20), &zone);
1675  list->Add(CharacterRange(30, 40), &zone);
1676  list->Add(CharacterRange(50, 60), &zone);
1677  set.Canonicalize();
1678  ASSERT_EQ(3, list->length());
1679  ASSERT_EQ(10, list->at(0).from());
1680  ASSERT_EQ(20, list->at(0).to());
1681  ASSERT_EQ(30, list->at(1).from());
1682  ASSERT_EQ(40, list->at(1).to());
1683  ASSERT_EQ(50, list->at(2).from());
1684  ASSERT_EQ(60, list->at(2).to());
1685
1686  list->Rewind(0);
1687  list->Add(CharacterRange(10, 20), &zone);
1688  list->Add(CharacterRange(50, 60), &zone);
1689  list->Add(CharacterRange(30, 40), &zone);
1690  set.Canonicalize();
1691  ASSERT_EQ(3, list->length());
1692  ASSERT_EQ(10, list->at(0).from());
1693  ASSERT_EQ(20, list->at(0).to());
1694  ASSERT_EQ(30, list->at(1).from());
1695  ASSERT_EQ(40, list->at(1).to());
1696  ASSERT_EQ(50, list->at(2).from());
1697  ASSERT_EQ(60, list->at(2).to());
1698
1699  list->Rewind(0);
1700  list->Add(CharacterRange(30, 40), &zone);
1701  list->Add(CharacterRange(10, 20), &zone);
1702  list->Add(CharacterRange(25, 25), &zone);
1703  list->Add(CharacterRange(100, 100), &zone);
1704  list->Add(CharacterRange(1, 1), &zone);
1705  set.Canonicalize();
1706  ASSERT_EQ(5, list->length());
1707  ASSERT_EQ(1, list->at(0).from());
1708  ASSERT_EQ(1, list->at(0).to());
1709  ASSERT_EQ(10, list->at(1).from());
1710  ASSERT_EQ(20, list->at(1).to());
1711  ASSERT_EQ(25, list->at(2).from());
1712  ASSERT_EQ(25, list->at(2).to());
1713  ASSERT_EQ(30, list->at(3).from());
1714  ASSERT_EQ(40, list->at(3).to());
1715  ASSERT_EQ(100, list->at(4).from());
1716  ASSERT_EQ(100, list->at(4).to());
1717
1718  list->Rewind(0);
1719  list->Add(CharacterRange(10, 19), &zone);
1720  list->Add(CharacterRange(21, 30), &zone);
1721  list->Add(CharacterRange(20, 20), &zone);
1722  set.Canonicalize();
1723  ASSERT_EQ(1, list->length());
1724  ASSERT_EQ(10, list->at(0).from());
1725  ASSERT_EQ(30, list->at(0).to());
1726}
1727
1728
1729TEST(CharacterRangeMerge) {
1730  v8::internal::V8::Initialize(NULL);
1731  Zone zone(CcTest::i_isolate());
1732  ZoneList<CharacterRange> l1(4, &zone);
1733  ZoneList<CharacterRange> l2(4, &zone);
1734  // Create all combinations of intersections of ranges, both singletons and
1735  // longer.
1736
1737  int offset = 0;
1738
1739  // The five kinds of singleton intersections:
1740  //     X
1741  //   Y      - outside before
1742  //    Y     - outside touching start
1743  //     Y    - overlap
1744  //      Y   - outside touching end
1745  //       Y  - outside after
1746
1747  for (int i = 0; i < 5; i++) {
1748    l1.Add(CharacterRange::Singleton(offset + 2), &zone);
1749    l2.Add(CharacterRange::Singleton(offset + i), &zone);
1750    offset += 6;
1751  }
1752
1753  // The seven kinds of singleton/non-singleton intersections:
1754  //    XXX
1755  //  Y        - outside before
1756  //   Y       - outside touching start
1757  //    Y      - inside touching start
1758  //     Y     - entirely inside
1759  //      Y    - inside touching end
1760  //       Y   - outside touching end
1761  //        Y  - disjoint after
1762
1763  for (int i = 0; i < 7; i++) {
1764    l1.Add(CharacterRange::Range(offset + 2, offset + 4), &zone);
1765    l2.Add(CharacterRange::Singleton(offset + i), &zone);
1766    offset += 8;
1767  }
1768
1769  // The eleven kinds of non-singleton intersections:
1770  //
1771  //       XXXXXXXX
1772  // YYYY                  - outside before.
1773  //   YYYY                - outside touching start.
1774  //     YYYY              - overlapping start
1775  //       YYYY            - inside touching start
1776  //         YYYY          - entirely inside
1777  //           YYYY        - inside touching end
1778  //             YYYY      - overlapping end
1779  //               YYYY    - outside touching end
1780  //                 YYYY  - outside after
1781  //       YYYYYYYY        - identical
1782  //     YYYYYYYYYYYY      - containing entirely.
1783
1784  for (int i = 0; i < 9; i++) {
1785    l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);  // Length 8.
1786    l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3), &zone);
1787    offset += 22;
1788  }
1789  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1790  l2.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1791  offset += 22;
1792  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1793  l2.Add(CharacterRange::Range(offset + 4, offset + 17), &zone);
1794  offset += 22;
1795
1796  // Different kinds of multi-range overlap:
1797  // XXXXXXXXXXXXXXXXXXXXXX         XXXXXXXXXXXXXXXXXXXXXX
1798  //   YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y
1799
1800  l1.Add(CharacterRange::Range(offset, offset + 21), &zone);
1801  l1.Add(CharacterRange::Range(offset + 31, offset + 52), &zone);
1802  for (int i = 0; i < 6; i++) {
1803    l2.Add(CharacterRange::Range(offset + 2, offset + 5), &zone);
1804    l2.Add(CharacterRange::Singleton(offset + 8), &zone);
1805    offset += 9;
1806  }
1807
1808  ASSERT(CharacterRange::IsCanonical(&l1));
1809  ASSERT(CharacterRange::IsCanonical(&l2));
1810
1811  ZoneList<CharacterRange> first_only(4, &zone);
1812  ZoneList<CharacterRange> second_only(4, &zone);
1813  ZoneList<CharacterRange> both(4, &zone);
1814}
1815
1816
1817TEST(Graph) {
1818  V8::Initialize(NULL);
1819  Execute("\\b\\w+\\b", false, true, true);
1820}
1821