1// Copyright 2012 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include <cstdlib>
29#include <sstream>
30
31#include "include/v8.h"
32#include "src/v8.h"
33
34#include "src/ast/ast.h"
35#include "src/char-predicates-inl.h"
36#include "src/ostreams.h"
37#include "src/regexp/jsregexp.h"
38#include "src/regexp/regexp-macro-assembler.h"
39#include "src/regexp/regexp-macro-assembler-irregexp.h"
40#include "src/regexp/regexp-parser.h"
41#include "src/splay-tree-inl.h"
42#include "src/string-stream.h"
43#ifdef V8_INTERPRETED_REGEXP
44#include "src/regexp/interpreter-irregexp.h"
45#else  // V8_INTERPRETED_REGEXP
46#include "src/macro-assembler.h"
47#if V8_TARGET_ARCH_ARM
48#include "src/arm/assembler-arm.h"  // NOLINT
49#include "src/arm/macro-assembler-arm.h"
50#include "src/regexp/arm/regexp-macro-assembler-arm.h"
51#endif
52#if V8_TARGET_ARCH_ARM64
53#include "src/arm64/assembler-arm64.h"
54#include "src/arm64/macro-assembler-arm64.h"
55#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
56#endif
57#if V8_TARGET_ARCH_S390
58#include "src/regexp/s390/regexp-macro-assembler-s390.h"
59#include "src/s390/assembler-s390.h"
60#include "src/s390/macro-assembler-s390.h"
61#endif
62#if V8_TARGET_ARCH_PPC
63#include "src/ppc/assembler-ppc.h"
64#include "src/ppc/macro-assembler-ppc.h"
65#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
66#endif
67#if V8_TARGET_ARCH_MIPS
68#include "src/mips/assembler-mips.h"
69#include "src/mips/macro-assembler-mips.h"
70#include "src/regexp/mips/regexp-macro-assembler-mips.h"
71#endif
72#if V8_TARGET_ARCH_MIPS64
73#include "src/mips64/assembler-mips64.h"
74#include "src/mips64/macro-assembler-mips64.h"
75#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
76#endif
77#if V8_TARGET_ARCH_X64
78#include "src/regexp/x64/regexp-macro-assembler-x64.h"
79#include "src/x64/assembler-x64.h"
80#include "src/x64/macro-assembler-x64.h"
81#endif
82#if V8_TARGET_ARCH_IA32
83#include "src/ia32/assembler-ia32.h"
84#include "src/ia32/macro-assembler-ia32.h"
85#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
86#endif
87#if V8_TARGET_ARCH_X87
88#include "src/regexp/x87/regexp-macro-assembler-x87.h"
89#include "src/x87/assembler-x87.h"
90#include "src/x87/macro-assembler-x87.h"
91#endif
92#endif  // V8_INTERPRETED_REGEXP
93#include "test/cctest/cctest.h"
94
95using namespace v8::internal;
96
97
98static bool CheckParse(const char* input) {
99  v8::HandleScope scope(CcTest::isolate());
100  Zone zone(CcTest::i_isolate()->allocator());
101  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
102  RegExpCompileData result;
103  return v8::internal::RegExpParser::ParseRegExp(
104      CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result);
105}
106
107
108static void CheckParseEq(const char* input, const char* expected,
109                         bool unicode = false) {
110  v8::HandleScope scope(CcTest::isolate());
111  Zone zone(CcTest::i_isolate()->allocator());
112  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
113  RegExpCompileData result;
114  JSRegExp::Flags flags = JSRegExp::kNone;
115  if (unicode) flags |= JSRegExp::kUnicode;
116  CHECK(v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), &zone,
117                                                &reader, flags, &result));
118  CHECK(result.tree != NULL);
119  CHECK(result.error.is_null());
120  std::ostringstream os;
121  result.tree->Print(os, &zone);
122  if (strcmp(expected, os.str().c_str()) != 0) {
123    printf("%s | %s\n", expected, os.str().c_str());
124  }
125  CHECK_EQ(0, strcmp(expected, os.str().c_str()));
126}
127
128
129static bool CheckSimple(const char* input) {
130  v8::HandleScope scope(CcTest::isolate());
131  Zone zone(CcTest::i_isolate()->allocator());
132  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
133  RegExpCompileData result;
134  CHECK(v8::internal::RegExpParser::ParseRegExp(
135      CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
136  CHECK(result.tree != NULL);
137  CHECK(result.error.is_null());
138  return result.simple;
139}
140
// Pair of match-length bounds reported for a parsed regexp tree.  Kept as a
// plain aggregate so it can be brace-initialized as {min, max}.
struct MinMaxPair {
  int min_match;  // Value of RegExpTree::min_match().
  int max_match;  // Value of RegExpTree::max_match().
};
145
146
147static MinMaxPair CheckMinMaxMatch(const char* input) {
148  v8::HandleScope scope(CcTest::isolate());
149  Zone zone(CcTest::i_isolate()->allocator());
150  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
151  RegExpCompileData result;
152  CHECK(v8::internal::RegExpParser::ParseRegExp(
153      CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
154  CHECK(result.tree != NULL);
155  CHECK(result.error.is_null());
156  int min_match = result.tree->min_match();
157  int max_match = result.tree->max_match();
158  MinMaxPair pair = { min_match, max_match };
159  return pair;
160}
161
162
// Asserts that |input| is rejected by the regexp parser.
#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))

// Asserts that the parser's "simple" classification of |input| equals
// |simple|.  Expands to a single expression; the caller supplies the ';'.
#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input))

// Asserts that the tree parsed from |input| reports exactly |min| and |max|
// as its minimum and maximum match lengths.  Wrapped in do/while so that the
// macro behaves like one statement, e.g. inside an unbraced if/else.
#define CHECK_MIN_MAX(input, min, max)                                         \
  do {                                                                         \
    MinMaxPair min_max = CheckMinMaxMatch(input);                              \
    CHECK_EQ(min, min_max.min_match);                                          \
    CHECK_EQ(max, min_max.max_match);                                          \
  } while (false)
170
171
172void TestRegExpParser(bool lookbehind) {
173  FLAG_harmony_regexp_lookbehind = lookbehind;
174
175  CHECK_PARSE_ERROR("?");
176
177  CheckParseEq("abc", "'abc'");
178  CheckParseEq("", "%");
179  CheckParseEq("abc|def", "(| 'abc' 'def')");
180  CheckParseEq("abc|def|ghi", "(| 'abc' 'def' 'ghi')");
181  CheckParseEq("^xxx$", "(: @^i 'xxx' @$i)");
182  CheckParseEq("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')");
183  CheckParseEq("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])");
184  CheckParseEq("a*", "(# 0 - g 'a')");
185  CheckParseEq("a*?", "(# 0 - n 'a')");
186  CheckParseEq("abc+", "(: 'ab' (# 1 - g 'c'))");
187  CheckParseEq("abc+?", "(: 'ab' (# 1 - n 'c'))");
188  CheckParseEq("xyz?", "(: 'xy' (# 0 1 g 'z'))");
189  CheckParseEq("xyz??", "(: 'xy' (# 0 1 n 'z'))");
190  CheckParseEq("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))");
191  CheckParseEq("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))");
192  CheckParseEq("xyz{93}", "(: 'xy' (# 93 93 g 'z'))");
193  CheckParseEq("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))");
194  CheckParseEq("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))");
195  CheckParseEq("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))");
196  CheckParseEq("xyz{1,}", "(: 'xy' (# 1 - g 'z'))");
197  CheckParseEq("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))");
198  CheckParseEq("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'");
199  CheckParseEq("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')");
200  CheckParseEq("(?:foo)", "'foo'");
201  CheckParseEq("(?: foo )", "' foo '");
202  CheckParseEq("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))");
203  CheckParseEq("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')");
204  CheckParseEq("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')");
205  CheckParseEq("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
206  if (lookbehind) {
207    CheckParseEq("foo(?<=bar)baz", "(: 'foo' (<- + 'bar') 'baz')");
208    CheckParseEq("foo(?<!bar)baz", "(: 'foo' (<- - 'bar') 'baz')");
209  } else {
210    CHECK_PARSE_ERROR("foo(?<=bar)baz");
211    CHECK_PARSE_ERROR("foo(?<!bar)baz");
212  }
213  CheckParseEq("()", "(^ %)");
214  CheckParseEq("(?=)", "(-> + %)");
215  CheckParseEq("[]", "^[\\x00-\\u{10ffff}]");  // Doesn't compile on windows
216  CheckParseEq("[^]", "[\\x00-\\u{10ffff}]");  // \uffff isn't in codepage 1252
217  CheckParseEq("[x]", "[x]");
218  CheckParseEq("[xyz]", "[x y z]");
219  CheckParseEq("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
220  CheckParseEq("[-123]", "[- 1 2 3]");
221  CheckParseEq("[^123]", "^[1 2 3]");
222  CheckParseEq("]", "']'");
223  CheckParseEq("}", "'}'");
224  CheckParseEq("[a-b-c]", "[a-b - c]");
225  CheckParseEq("[\\d]", "[0-9]");
226  CheckParseEq("[x\\dz]", "[x 0-9 z]");
227  CheckParseEq("[\\d-z]", "[0-9 - z]");
228  CheckParseEq("[\\d-\\d]", "[0-9 - 0-9]");
229  CheckParseEq("[z-\\d]", "[z - 0-9]");
230  // Control character outside character class.
231  CheckParseEq("\\cj\\cJ\\ci\\cI\\ck\\cK", "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
232  CheckParseEq("\\c!", "'\\c!'");
233  CheckParseEq("\\c_", "'\\c_'");
234  CheckParseEq("\\c~", "'\\c~'");
235  CheckParseEq("\\c1", "'\\c1'");
236  // Control character inside character class.
237  CheckParseEq("[\\c!]", "[\\ c !]");
238  CheckParseEq("[\\c_]", "[\\x1f]");
239  CheckParseEq("[\\c~]", "[\\ c ~]");
240  CheckParseEq("[\\ca]", "[\\x01]");
241  CheckParseEq("[\\cz]", "[\\x1a]");
242  CheckParseEq("[\\cA]", "[\\x01]");
243  CheckParseEq("[\\cZ]", "[\\x1a]");
244  CheckParseEq("[\\c1]", "[\\x11]");
245
246  CheckParseEq("[a\\]c]", "[a ] c]");
247  CheckParseEq("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
248  CheckParseEq("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ #  ]");
249  CheckParseEq("\\0", "'\\x00'");
250  CheckParseEq("\\8", "'8'");
251  CheckParseEq("\\9", "'9'");
252  CheckParseEq("\\11", "'\\x09'");
253  CheckParseEq("\\11a", "'\\x09a'");
254  CheckParseEq("\\011", "'\\x09'");
255  CheckParseEq("\\00011", "'\\x0011'");
256  CheckParseEq("\\118", "'\\x098'");
257  CheckParseEq("\\111", "'I'");
258  CheckParseEq("\\1111", "'I1'");
259  CheckParseEq("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))");
260  CheckParseEq("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))");
261  CheckParseEq("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))");
262  CheckParseEq("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')");
263  CheckParseEq("(x)(x)(x)\\1*",
264               "(: (^ 'x') (^ 'x') (^ 'x')"
265               " (# 0 - g (<- 1)))");
266  CheckParseEq("(x)(x)(x)\\2*",
267               "(: (^ 'x') (^ 'x') (^ 'x')"
268               " (# 0 - g (<- 2)))");
269  CheckParseEq("(x)(x)(x)\\3*",
270               "(: (^ 'x') (^ 'x') (^ 'x')"
271               " (# 0 - g (<- 3)))");
272  CheckParseEq("(x)(x)(x)\\4*",
273               "(: (^ 'x') (^ 'x') (^ 'x')"
274               " (# 0 - g '\\x04'))");
275  CheckParseEq("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
276               "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
277               " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))");
278  CheckParseEq("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11",
279               "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
280               " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')");
281  CheckParseEq("(a)\\1", "(: (^ 'a') (<- 1))");
282  CheckParseEq("(a\\1)", "(^ 'a')");
283  CheckParseEq("(\\1a)", "(^ 'a')");
284  CheckParseEq("(\\2)(\\1)", "(: (^ (<- 2)) (^ (<- 1)))");
285  CheckParseEq("(?=a)?a", "'a'");
286  CheckParseEq("(?=a){0,10}a", "'a'");
287  CheckParseEq("(?=a){1,10}a", "(: (-> + 'a') 'a')");
288  CheckParseEq("(?=a){9,10}a", "(: (-> + 'a') 'a')");
289  CheckParseEq("(?!a)?a", "'a'");
290  CheckParseEq("\\1(a)", "(: (<- 1) (^ 'a'))");
291  CheckParseEq("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
292  CheckParseEq("(?!\\1(a\\1)\\1)\\1",
293               "(: (-> - (: (<- 1) (^ 'a') (<- 1))) (<- 1))");
294  CheckParseEq("\\1\\2(a(?:\\1(b\\1\\2))\\2)\\1",
295               "(: (<- 1) (<- 2) (^ (: 'a' (^ 'b') (<- 2))) (<- 1))");
296  if (lookbehind) {
297    CheckParseEq("\\1\\2(a(?<=\\1(b\\1\\2))\\2)\\1",
298                 "(: (<- 1) (<- 2) (^ (: 'a' (<- + (^ 'b')) (<- 2))) (<- 1))");
299  }
300  CheckParseEq("[\\0]", "[\\x00]");
301  CheckParseEq("[\\11]", "[\\x09]");
302  CheckParseEq("[\\11a]", "[\\x09 a]");
303  CheckParseEq("[\\011]", "[\\x09]");
304  CheckParseEq("[\\00011]", "[\\x00 1 1]");
305  CheckParseEq("[\\118]", "[\\x09 8]");
306  CheckParseEq("[\\111]", "[I]");
307  CheckParseEq("[\\1111]", "[I 1]");
308  CheckParseEq("\\x34", "'\x34'");
309  CheckParseEq("\\x60", "'\x60'");
310  CheckParseEq("\\x3z", "'x3z'");
311  CheckParseEq("\\c", "'\\c'");
312  CheckParseEq("\\u0034", "'\x34'");
313  CheckParseEq("\\u003z", "'u003z'");
314  CheckParseEq("foo[z]*", "(: 'foo' (# 0 - g [z]))");
315
316  // Unicode regexps
317  CheckParseEq("\\u{12345}", "'\\ud808\\udf45'", true);
318  CheckParseEq("\\u{12345}\\u{23456}", "(! '\\ud808\\udf45' '\\ud84d\\udc56')",
319               true);
320  CheckParseEq("\\u{12345}|\\u{23456}", "(| '\\ud808\\udf45' '\\ud84d\\udc56')",
321               true);
322  CheckParseEq("\\u{12345}{3}", "(# 3 3 g '\\ud808\\udf45')", true);
323  CheckParseEq("\\u{12345}*", "(# 0 - g '\\ud808\\udf45')", true);
324
325  CheckParseEq("\\ud808\\udf45*", "(# 0 - g '\\ud808\\udf45')", true);
326  CheckParseEq("[\\ud808\\udf45-\\ud809\\udccc]", "[\\u{012345}-\\u{0124cc}]",
327               true);
328
329  CHECK_SIMPLE("", false);
330  CHECK_SIMPLE("a", true);
331  CHECK_SIMPLE("a|b", false);
332  CHECK_SIMPLE("a\\n", false);
333  CHECK_SIMPLE("^a", false);
334  CHECK_SIMPLE("a$", false);
335  CHECK_SIMPLE("a\\b!", false);
336  CHECK_SIMPLE("a\\Bb", false);
337  CHECK_SIMPLE("a*", false);
338  CHECK_SIMPLE("a*?", false);
339  CHECK_SIMPLE("a?", false);
340  CHECK_SIMPLE("a??", false);
341  CHECK_SIMPLE("a{0,1}?", false);
342  CHECK_SIMPLE("a{1,1}?", false);
343  CHECK_SIMPLE("a{1,2}?", false);
344  CHECK_SIMPLE("a+?", false);
345  CHECK_SIMPLE("(a)", false);
346  CHECK_SIMPLE("(a)\\1", false);
347  CHECK_SIMPLE("(\\1a)", false);
348  CHECK_SIMPLE("\\1(a)", false);
349  CHECK_SIMPLE("a\\s", false);
350  CHECK_SIMPLE("a\\S", false);
351  CHECK_SIMPLE("a\\d", false);
352  CHECK_SIMPLE("a\\D", false);
353  CHECK_SIMPLE("a\\w", false);
354  CHECK_SIMPLE("a\\W", false);
355  CHECK_SIMPLE("a.", false);
356  CHECK_SIMPLE("a\\q", false);
357  CHECK_SIMPLE("a[a]", false);
358  CHECK_SIMPLE("a[^a]", false);
359  CHECK_SIMPLE("a[a-z]", false);
360  CHECK_SIMPLE("a[\\q]", false);
361  CHECK_SIMPLE("a(?:b)", false);
362  CHECK_SIMPLE("a(?=b)", false);
363  CHECK_SIMPLE("a(?!b)", false);
364  CHECK_SIMPLE("\\x60", false);
365  CHECK_SIMPLE("\\u0060", false);
366  CHECK_SIMPLE("\\cA", false);
367  CHECK_SIMPLE("\\q", false);
368  CHECK_SIMPLE("\\1112", false);
369  CHECK_SIMPLE("\\0", false);
370  CHECK_SIMPLE("(a)\\1", false);
371  CHECK_SIMPLE("(?=a)?a", false);
372  CHECK_SIMPLE("(?!a)?a\\1", false);
373  CHECK_SIMPLE("(?:(?=a))a\\1", false);
374
375  CheckParseEq("a{}", "'a{}'");
376  CheckParseEq("a{,}", "'a{,}'");
377  CheckParseEq("a{", "'a{'");
378  CheckParseEq("a{z}", "'a{z}'");
379  CheckParseEq("a{1z}", "'a{1z}'");
380  CheckParseEq("a{12z}", "'a{12z}'");
381  CheckParseEq("a{12,", "'a{12,'");
382  CheckParseEq("a{12,3b", "'a{12,3b'");
383  CheckParseEq("{}", "'{}'");
384  CheckParseEq("{,}", "'{,}'");
385  CheckParseEq("{", "'{'");
386  CheckParseEq("{z}", "'{z}'");
387  CheckParseEq("{1z}", "'{1z}'");
388  CheckParseEq("{12z}", "'{12z}'");
389  CheckParseEq("{12,", "'{12,'");
390  CheckParseEq("{12,3b", "'{12,3b'");
391
392  CHECK_MIN_MAX("a", 1, 1);
393  CHECK_MIN_MAX("abc", 3, 3);
394  CHECK_MIN_MAX("a[bc]d", 3, 3);
395  CHECK_MIN_MAX("a|bc", 1, 2);
396  CHECK_MIN_MAX("ab|c", 1, 2);
397  CHECK_MIN_MAX("a||bc", 0, 2);
398  CHECK_MIN_MAX("|", 0, 0);
399  CHECK_MIN_MAX("(?:ab)", 2, 2);
400  CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
401  CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
402  CHECK_MIN_MAX("(ab)", 2, 2);
403  CHECK_MIN_MAX("(ab|cde)", 2, 3);
404  CHECK_MIN_MAX("(ab)\\1", 2, RegExpTree::kInfinity);
405  CHECK_MIN_MAX("(ab|cde)\\1", 2, RegExpTree::kInfinity);
406  CHECK_MIN_MAX("(?:ab)?", 0, 2);
407  CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
408  CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
409  CHECK_MIN_MAX("a?", 0, 1);
410  CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
411  CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
412  CHECK_MIN_MAX("a??", 0, 1);
413  CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
414  CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
415  CHECK_MIN_MAX("(?:a?)?", 0, 1);
416  CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
417  CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
418  CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
419  CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
420  CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
421  CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
422  CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
423  CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
424  CHECK_MIN_MAX("a{0}", 0, 0);
425  CHECK_MIN_MAX("(?:a+){0}", 0, 0);
426  CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
427  CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
428  CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
429  CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
430  CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
431  CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
432  CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
433  CHECK_MIN_MAX("a\\bc", 2, 2);
434  CHECK_MIN_MAX("a\\Bc", 2, 2);
435  CHECK_MIN_MAX("a\\sc", 3, 3);
436  CHECK_MIN_MAX("a\\Sc", 3, 3);
437  CHECK_MIN_MAX("a(?=b)c", 2, 2);
438  CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
439  CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
440
441  FLAG_harmony_regexp_named_captures = true;
442  CheckParseEq("(?<a>x)(?<b>x)(?<c>x)\\k<a>",
443               "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))", true);
444  CheckParseEq("(?<a>x)(?<b>x)(?<c>x)\\k<b>",
445               "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))", true);
446  CheckParseEq("(?<a>x)(?<b>x)(?<c>x)\\k<c>",
447               "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))", true);
448  CheckParseEq("(?<a>a)\\k<a>", "(: (^ 'a') (<- 1))", true);
449  CheckParseEq("(?<a>a\\k<a>)", "(^ 'a')", true);
450  CheckParseEq("(?<a>\\k<a>a)", "(^ 'a')", true);
451  CheckParseEq("(?<a>\\k<b>)(?<b>\\k<a>)", "(: (^ (<- 2)) (^ (<- 1)))", true);
452  CheckParseEq("\\k<a>(?<a>a)", "(: (<- 1) (^ 'a'))", true);
453
454  CheckParseEq("(?<\\u{03C0}>a)", "(^ 'a')", true);
455  CheckParseEq("(?<\\u03C0>a)", "(^ 'a')", true);
456  FLAG_harmony_regexp_named_captures = false;
457}
458
459
460TEST(ParserWithLookbehind) {
461  TestRegExpParser(true);  // Lookbehind enabled.
462}
463
464
465TEST(ParserWithoutLookbehind) {
466  TestRegExpParser(true);  // Lookbehind enabled.
467}
468
469TEST(ParserRegression) {
470  CheckParseEq("[A-Z$-][x]", "(! [A-Z $ -] [x])");
471  CheckParseEq("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
472  CheckParseEq("{", "'{'");
473  CheckParseEq("a|", "(| 'a' %)");
474}
475
476static void ExpectError(const char* input, const char* expected,
477                        bool unicode = false) {
478  v8::HandleScope scope(CcTest::isolate());
479  Zone zone(CcTest::i_isolate()->allocator());
480  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
481  RegExpCompileData result;
482  JSRegExp::Flags flags = JSRegExp::kNone;
483  if (unicode) flags |= JSRegExp::kUnicode;
484  CHECK(!v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), &zone,
485                                                 &reader, flags, &result));
486  CHECK(result.tree == NULL);
487  CHECK(!result.error.is_null());
488  v8::base::SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS);
489  CHECK_EQ(0, strcmp(expected, str.get()));
490}
491
492
493TEST(Errors) {
494  const char* kEndBackslash = "\\ at end of pattern";
495  ExpectError("\\", kEndBackslash);
496  const char* kUnterminatedGroup = "Unterminated group";
497  ExpectError("(foo", kUnterminatedGroup);
498  const char* kInvalidGroup = "Invalid group";
499  ExpectError("(?", kInvalidGroup);
500  const char* kUnterminatedCharacterClass = "Unterminated character class";
501  ExpectError("[", kUnterminatedCharacterClass);
502  ExpectError("[a-", kUnterminatedCharacterClass);
503  const char* kNothingToRepeat = "Nothing to repeat";
504  ExpectError("*", kNothingToRepeat);
505  ExpectError("?", kNothingToRepeat);
506  ExpectError("+", kNothingToRepeat);
507  ExpectError("{1}", kNothingToRepeat);
508  ExpectError("{1,2}", kNothingToRepeat);
509  ExpectError("{1,}", kNothingToRepeat);
510
511  // Check that we don't allow more than kMaxCapture captures
512  const int kMaxCaptures = 1 << 16;  // Must match RegExpParser::kMaxCaptures.
513  const char* kTooManyCaptures = "Too many captures";
514  std::ostringstream os;
515  for (int i = 0; i <= kMaxCaptures; i++) {
516    os << "()";
517  }
518  ExpectError(os.str().c_str(), kTooManyCaptures);
519
520  FLAG_harmony_regexp_named_captures = true;
521  const char* kInvalidCaptureName = "Invalid capture group name";
522  ExpectError("(?<>.)", kInvalidCaptureName, true);
523  ExpectError("(?<1>.)", kInvalidCaptureName, true);
524  ExpectError("(?<_%>.)", kInvalidCaptureName, true);
525  ExpectError("\\k<a", kInvalidCaptureName, true);
526  const char* kDuplicateCaptureName = "Duplicate capture group name";
527  ExpectError("(?<a>.)(?<a>.)", kDuplicateCaptureName, true);
528  const char* kInvalidUnicodeEscape = "Invalid Unicode escape sequence";
529  ExpectError("(?<\\u{FISK}", kInvalidUnicodeEscape, true);
530  const char* kInvalidCaptureReferenced = "Invalid named capture referenced";
531  ExpectError("\\k<a>", kInvalidCaptureReferenced, true);
532  ExpectError("(?<b>)\\k<a>", kInvalidCaptureReferenced, true);
533  const char* kInvalidNamedReference = "Invalid named reference";
534  ExpectError("\\ka", kInvalidNamedReference, true);
535  FLAG_harmony_regexp_named_captures = false;
536}
537
538
539static bool IsDigit(uc16 c) {
540  return ('0' <= c && c <= '9');
541}
542
543
544static bool NotDigit(uc16 c) {
545  return !IsDigit(c);
546}
547
548
549static bool IsWhiteSpaceOrLineTerminator(uc16 c) {
550  // According to ECMA 5.1, 15.10.2.12 the CharacterClassEscape \s includes
551  // WhiteSpace (7.2) and LineTerminator (7.3) values.
552  return v8::internal::WhiteSpaceOrLineTerminator::Is(c);
553}
554
555
556static bool NotWhiteSpaceNorLineTermiantor(uc16 c) {
557  return !IsWhiteSpaceOrLineTerminator(c);
558}
559
560
561static bool NotWord(uc16 c) {
562  return !IsRegExpWord(c);
563}
564
565
566static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
567  Zone zone(CcTest::i_isolate()->allocator());
568  ZoneList<CharacterRange>* ranges =
569      new(&zone) ZoneList<CharacterRange>(2, &zone);
570  CharacterRange::AddClassEscape(c, ranges, &zone);
571  for (uc32 i = 0; i < (1 << 16); i++) {
572    bool in_class = false;
573    for (int j = 0; !in_class && j < ranges->length(); j++) {
574      CharacterRange& range = ranges->at(j);
575      in_class = (range.from() <= i && i <= range.to());
576    }
577    CHECK_EQ(pred(i), in_class);
578  }
579}
580
581
582TEST(CharacterClassEscapes) {
583  TestCharacterClassEscapes('.', IsRegExpNewline);
584  TestCharacterClassEscapes('d', IsDigit);
585  TestCharacterClassEscapes('D', NotDigit);
586  TestCharacterClassEscapes('s', IsWhiteSpaceOrLineTerminator);
587  TestCharacterClassEscapes('S', NotWhiteSpaceNorLineTermiantor);
588  TestCharacterClassEscapes('w', IsRegExpWord);
589  TestCharacterClassEscapes('W', NotWord);
590}
591
592
593static RegExpNode* Compile(const char* input, bool multiline, bool unicode,
594                           bool is_one_byte, Zone* zone) {
595  Isolate* isolate = CcTest::i_isolate();
596  FlatStringReader reader(isolate, CStrVector(input));
597  RegExpCompileData compile_data;
598  JSRegExp::Flags flags = JSRegExp::kNone;
599  if (multiline) flags = JSRegExp::kMultiline;
600  if (unicode) flags = JSRegExp::kUnicode;
601  if (!v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), zone,
602                                               &reader, flags, &compile_data))
603    return NULL;
604  Handle<String> pattern = isolate->factory()
605                               ->NewStringFromUtf8(CStrVector(input))
606                               .ToHandleChecked();
607  Handle<String> sample_subject =
608      isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked();
609  RegExpEngine::Compile(isolate, zone, &compile_data, flags, pattern,
610                        sample_subject, is_one_byte);
611  return compile_data.node;
612}
613
614
615static void Execute(const char* input, bool multiline, bool unicode,
616                    bool is_one_byte, bool dot_output = false) {
617  v8::HandleScope scope(CcTest::isolate());
618  Zone zone(CcTest::i_isolate()->allocator());
619  RegExpNode* node = Compile(input, multiline, unicode, is_one_byte, &zone);
620  USE(node);
621#ifdef DEBUG
622  if (dot_output) {
623    RegExpEngine::DotPrint(input, node, false);
624  }
625#endif  // DEBUG
626}
627
628
// Configuration type used to instantiate ZoneSplayTree in the tests below:
// int keys compared numerically, int values.
class TestConfig {
 public:
  typedef int Key;
  typedef int Value;

  // Constant named as the "no key" value (presumably a sentinel required by
  // the tree's config contract -- confirm against the splay tree header).
  static const int kNoKey;

  // Value returned as the "no value" placeholder.
  static int NoValue() { return 0; }

  // Three-way comparison: -1 if a < b, 1 if a > b, 0 if equal.
  static inline int Compare(int a, int b) {
    if (a == b) return 0;
    return (a < b) ? -1 : 1;
  }
};


const int TestConfig::kNoKey = 0;
647
648
// Deterministic mixing function used to generate reproducible test data:
// scales the two inputs by fixed constants, XORs them and returns the
// resulting bits as an unsigned value.
static unsigned PseudoRandom(int i, int j) {
  const int scaled_i = i * 781;
  const int scaled_j = j * 329;
  return static_cast<unsigned>(scaled_i ^ scaled_j);
}
652
653
654TEST(SplayTreeSimple) {
655  static const unsigned kLimit = 1000;
656  Zone zone(CcTest::i_isolate()->allocator());
657  ZoneSplayTree<TestConfig> tree(&zone);
658  bool seen[kLimit];
659  for (unsigned i = 0; i < kLimit; i++) seen[i] = false;
660#define CHECK_MAPS_EQUAL() do {                                      \
661    for (unsigned k = 0; k < kLimit; k++)                            \
662      CHECK_EQ(seen[k], tree.Find(k, &loc));                         \
663  } while (false)
664  for (int i = 0; i < 50; i++) {
665    for (int j = 0; j < 50; j++) {
666      int next = PseudoRandom(i, j) % kLimit;
667      if (seen[next]) {
668        // We've already seen this one.  Check the value and remove
669        // it.
670        ZoneSplayTree<TestConfig>::Locator loc;
671        CHECK(tree.Find(next, &loc));
672        CHECK_EQ(next, loc.key());
673        CHECK_EQ(3 * next, loc.value());
674        tree.Remove(next);
675        seen[next] = false;
676        CHECK_MAPS_EQUAL();
677      } else {
678        // Check that it wasn't there already and then add it.
679        ZoneSplayTree<TestConfig>::Locator loc;
680        CHECK(!tree.Find(next, &loc));
681        CHECK(tree.Insert(next, &loc));
682        CHECK_EQ(next, loc.key());
683        loc.set_value(3 * next);
684        seen[next] = true;
685        CHECK_MAPS_EQUAL();
686      }
687      int val = PseudoRandom(j, i) % kLimit;
688      if (seen[val]) {
689        ZoneSplayTree<TestConfig>::Locator loc;
690        CHECK(tree.FindGreatestLessThan(val, &loc));
691        CHECK_EQ(loc.key(), val);
692        break;
693      }
694      val = PseudoRandom(i + j, i - j) % kLimit;
695      if (seen[val]) {
696        ZoneSplayTree<TestConfig>::Locator loc;
697        CHECK(tree.FindLeastGreaterThan(val, &loc));
698        CHECK_EQ(loc.key(), val);
699        break;
700      }
701    }
702  }
703}
704
705
706TEST(DispatchTableConstruction) {
707  // Initialize test data.
708  static const int kLimit = 1000;
709  static const int kRangeCount = 8;
710  static const int kRangeSize = 16;
711  uc16 ranges[kRangeCount][2 * kRangeSize];
712  for (int i = 0; i < kRangeCount; i++) {
713    Vector<uc16> range(ranges[i], 2 * kRangeSize);
714    for (int j = 0; j < 2 * kRangeSize; j++) {
715      range[j] = PseudoRandom(i + 25, j + 87) % kLimit;
716    }
717    range.Sort();
718    for (int j = 1; j < 2 * kRangeSize; j++) {
719      CHECK(range[j-1] <= range[j]);
720    }
721  }
722  // Enter test data into dispatch table.
723  Zone zone(CcTest::i_isolate()->allocator());
724  DispatchTable table(&zone);
725  for (int i = 0; i < kRangeCount; i++) {
726    uc16* range = ranges[i];
727    for (int j = 0; j < 2 * kRangeSize; j += 2)
728      table.AddRange(CharacterRange::Range(range[j], range[j + 1]), i, &zone);
729  }
730  // Check that the table looks as we would expect
731  for (int p = 0; p < kLimit; p++) {
732    OutSet* outs = table.Get(p);
733    for (int j = 0; j < kRangeCount; j++) {
734      uc16* range = ranges[j];
735      bool is_on = false;
736      for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2)
737        is_on = (range[k] <= p && p <= range[k + 1]);
738      CHECK_EQ(is_on, outs->Get(j));
739    }
740  }
741}
742
743
744// Test of debug-only syntax.
745#ifdef DEBUG
746
747TEST(ParsePossessiveRepetition) {
748  bool old_flag_value = FLAG_regexp_possessive_quantifier;
749
750  // Enable possessive quantifier syntax.
751  FLAG_regexp_possessive_quantifier = true;
752
753  CheckParseEq("a*+", "(# 0 - p 'a')");
754  CheckParseEq("a++", "(# 1 - p 'a')");
755  CheckParseEq("a?+", "(# 0 1 p 'a')");
756  CheckParseEq("a{10,20}+", "(# 10 20 p 'a')");
757  CheckParseEq("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");
758
759  // Disable possessive quantifier syntax.
760  FLAG_regexp_possessive_quantifier = false;
761
762  CHECK_PARSE_ERROR("a*+");
763  CHECK_PARSE_ERROR("a++");
764  CHECK_PARSE_ERROR("a?+");
765  CHECK_PARSE_ERROR("a{10,20}+");
766  CHECK_PARSE_ERROR("a{10,20}+b");
767
768  FLAG_regexp_possessive_quantifier = old_flag_value;
769}
770
771#endif
772
773// Tests of interpreter.
774
775
776#ifndef V8_INTERPRETED_REGEXP
777
778#if V8_TARGET_ARCH_IA32
779typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler;
780#elif V8_TARGET_ARCH_X64
781typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler;
782#elif V8_TARGET_ARCH_ARM
783typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler;
784#elif V8_TARGET_ARCH_ARM64
785typedef RegExpMacroAssemblerARM64 ArchRegExpMacroAssembler;
786#elif V8_TARGET_ARCH_S390
787typedef RegExpMacroAssemblerS390 ArchRegExpMacroAssembler;
788#elif V8_TARGET_ARCH_PPC
789typedef RegExpMacroAssemblerPPC ArchRegExpMacroAssembler;
790#elif V8_TARGET_ARCH_MIPS
791typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
792#elif V8_TARGET_ARCH_MIPS64
793typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
794#elif V8_TARGET_ARCH_X87
795typedef RegExpMacroAssemblerX87 ArchRegExpMacroAssembler;
796#endif
797
798class ContextInitializer {
799 public:
800  ContextInitializer()
801      : scope_(CcTest::isolate()),
802        env_(v8::Context::New(CcTest::isolate())) {
803    env_->Enter();
804  }
805  ~ContextInitializer() {
806    env_->Exit();
807  }
808 private:
809  v8::HandleScope scope_;
810  v8::Local<v8::Context> env_;
811};
812
813
814static ArchRegExpMacroAssembler::Result Execute(Code* code,
815                                                String* input,
816                                                int start_offset,
817                                                const byte* input_start,
818                                                const byte* input_end,
819                                                int* captures) {
820  return NativeRegExpMacroAssembler::Execute(
821      code,
822      input,
823      start_offset,
824      input_start,
825      input_end,
826      captures,
827      0,
828      CcTest::i_isolate());
829}
830
831
832TEST(MacroAssemblerNativeSuccess) {
833  v8::V8::Initialize();
834  ContextInitializer initializer;
835  Isolate* isolate = CcTest::i_isolate();
836  Factory* factory = isolate->factory();
837  Zone zone(CcTest::i_isolate()->allocator());
838
839  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
840                             4);
841
842  m.Succeed();
843
844  Handle<String> source = factory->NewStringFromStaticChars("");
845  Handle<Object> code_object = m.GetCode(source);
846  Handle<Code> code = Handle<Code>::cast(code_object);
847
848  int captures[4] = {42, 37, 87, 117};
849  Handle<String> input = factory->NewStringFromStaticChars("foofoo");
850  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
851  const byte* start_adr =
852      reinterpret_cast<const byte*>(seq_input->GetCharsAddress());
853
854  NativeRegExpMacroAssembler::Result result =
855      Execute(*code,
856              *input,
857              0,
858              start_adr,
859              start_adr + seq_input->length(),
860              captures);
861
862  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
863  CHECK_EQ(-1, captures[0]);
864  CHECK_EQ(-1, captures[1]);
865  CHECK_EQ(-1, captures[2]);
866  CHECK_EQ(-1, captures[3]);
867}
868
869
870TEST(MacroAssemblerNativeSimple) {
871  v8::V8::Initialize();
872  ContextInitializer initializer;
873  Isolate* isolate = CcTest::i_isolate();
874  Factory* factory = isolate->factory();
875  Zone zone(CcTest::i_isolate()->allocator());
876
877  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
878                             4);
879
880  Label fail, backtrack;
881  m.PushBacktrack(&fail);
882  m.CheckNotAtStart(0, NULL);
883  m.LoadCurrentCharacter(2, NULL);
884  m.CheckNotCharacter('o', NULL);
885  m.LoadCurrentCharacter(1, NULL, false);
886  m.CheckNotCharacter('o', NULL);
887  m.LoadCurrentCharacter(0, NULL, false);
888  m.CheckNotCharacter('f', NULL);
889  m.WriteCurrentPositionToRegister(0, 0);
890  m.WriteCurrentPositionToRegister(1, 3);
891  m.AdvanceCurrentPosition(3);
892  m.PushBacktrack(&backtrack);
893  m.Succeed();
894  m.Bind(&backtrack);
895  m.Backtrack();
896  m.Bind(&fail);
897  m.Fail();
898
899  Handle<String> source = factory->NewStringFromStaticChars("^foo");
900  Handle<Object> code_object = m.GetCode(source);
901  Handle<Code> code = Handle<Code>::cast(code_object);
902
903  int captures[4] = {42, 37, 87, 117};
904  Handle<String> input = factory->NewStringFromStaticChars("foofoo");
905  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
906  Address start_adr = seq_input->GetCharsAddress();
907
908  NativeRegExpMacroAssembler::Result result =
909      Execute(*code,
910              *input,
911              0,
912              start_adr,
913              start_adr + input->length(),
914              captures);
915
916  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
917  CHECK_EQ(0, captures[0]);
918  CHECK_EQ(3, captures[1]);
919  CHECK_EQ(-1, captures[2]);
920  CHECK_EQ(-1, captures[3]);
921
922  input = factory->NewStringFromStaticChars("barbarbar");
923  seq_input = Handle<SeqOneByteString>::cast(input);
924  start_adr = seq_input->GetCharsAddress();
925
926  result = Execute(*code,
927                   *input,
928                   0,
929                   start_adr,
930                   start_adr + input->length(),
931                   captures);
932
933  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
934}
935
936
937TEST(MacroAssemblerNativeSimpleUC16) {
938  v8::V8::Initialize();
939  ContextInitializer initializer;
940  Isolate* isolate = CcTest::i_isolate();
941  Factory* factory = isolate->factory();
942  Zone zone(CcTest::i_isolate()->allocator());
943
944  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::UC16,
945                             4);
946
947  Label fail, backtrack;
948  m.PushBacktrack(&fail);
949  m.CheckNotAtStart(0, NULL);
950  m.LoadCurrentCharacter(2, NULL);
951  m.CheckNotCharacter('o', NULL);
952  m.LoadCurrentCharacter(1, NULL, false);
953  m.CheckNotCharacter('o', NULL);
954  m.LoadCurrentCharacter(0, NULL, false);
955  m.CheckNotCharacter('f', NULL);
956  m.WriteCurrentPositionToRegister(0, 0);
957  m.WriteCurrentPositionToRegister(1, 3);
958  m.AdvanceCurrentPosition(3);
959  m.PushBacktrack(&backtrack);
960  m.Succeed();
961  m.Bind(&backtrack);
962  m.Backtrack();
963  m.Bind(&fail);
964  m.Fail();
965
966  Handle<String> source = factory->NewStringFromStaticChars("^foo");
967  Handle<Object> code_object = m.GetCode(source);
968  Handle<Code> code = Handle<Code>::cast(code_object);
969
970  int captures[4] = {42, 37, 87, 117};
971  const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o',
972                              static_cast<uc16>(0x2603)};
973  Handle<String> input = factory->NewStringFromTwoByte(
974      Vector<const uc16>(input_data, 6)).ToHandleChecked();
975  Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
976  Address start_adr = seq_input->GetCharsAddress();
977
978  NativeRegExpMacroAssembler::Result result =
979      Execute(*code,
980              *input,
981              0,
982              start_adr,
983              start_adr + input->length(),
984              captures);
985
986  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
987  CHECK_EQ(0, captures[0]);
988  CHECK_EQ(3, captures[1]);
989  CHECK_EQ(-1, captures[2]);
990  CHECK_EQ(-1, captures[3]);
991
992  const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a',
993                               static_cast<uc16>(0x2603)};
994  input = factory->NewStringFromTwoByte(
995      Vector<const uc16>(input_data2, 9)).ToHandleChecked();
996  seq_input = Handle<SeqTwoByteString>::cast(input);
997  start_adr = seq_input->GetCharsAddress();
998
999  result = Execute(*code,
1000                   *input,
1001                   0,
1002                   start_adr,
1003                   start_adr + input->length() * 2,
1004                   captures);
1005
1006  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
1007}
1008
1009
1010TEST(MacroAssemblerNativeBacktrack) {
1011  v8::V8::Initialize();
1012  ContextInitializer initializer;
1013  Isolate* isolate = CcTest::i_isolate();
1014  Factory* factory = isolate->factory();
1015  Zone zone(CcTest::i_isolate()->allocator());
1016
1017  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
1018                             0);
1019
1020  Label fail;
1021  Label backtrack;
1022  m.LoadCurrentCharacter(10, &fail);
1023  m.Succeed();
1024  m.Bind(&fail);
1025  m.PushBacktrack(&backtrack);
1026  m.LoadCurrentCharacter(10, NULL);
1027  m.Succeed();
1028  m.Bind(&backtrack);
1029  m.Fail();
1030
1031  Handle<String> source = factory->NewStringFromStaticChars("..........");
1032  Handle<Object> code_object = m.GetCode(source);
1033  Handle<Code> code = Handle<Code>::cast(code_object);
1034
1035  Handle<String> input = factory->NewStringFromStaticChars("foofoo");
1036  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1037  Address start_adr = seq_input->GetCharsAddress();
1038
1039  NativeRegExpMacroAssembler::Result result =
1040      Execute(*code,
1041              *input,
1042              0,
1043              start_adr,
1044              start_adr + input->length(),
1045              NULL);
1046
1047  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
1048}
1049
1050
1051TEST(MacroAssemblerNativeBackReferenceLATIN1) {
1052  v8::V8::Initialize();
1053  ContextInitializer initializer;
1054  Isolate* isolate = CcTest::i_isolate();
1055  Factory* factory = isolate->factory();
1056  Zone zone(CcTest::i_isolate()->allocator());
1057
1058  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
1059                             4);
1060
1061  m.WriteCurrentPositionToRegister(0, 0);
1062  m.AdvanceCurrentPosition(2);
1063  m.WriteCurrentPositionToRegister(1, 0);
1064  Label nomatch;
1065  m.CheckNotBackReference(0, false, &nomatch);
1066  m.Fail();
1067  m.Bind(&nomatch);
1068  m.AdvanceCurrentPosition(2);
1069  Label missing_match;
1070  m.CheckNotBackReference(0, false, &missing_match);
1071  m.WriteCurrentPositionToRegister(2, 0);
1072  m.Succeed();
1073  m.Bind(&missing_match);
1074  m.Fail();
1075
1076  Handle<String> source = factory->NewStringFromStaticChars("^(..)..\1");
1077  Handle<Object> code_object = m.GetCode(source);
1078  Handle<Code> code = Handle<Code>::cast(code_object);
1079
1080  Handle<String> input = factory->NewStringFromStaticChars("fooofo");
1081  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1082  Address start_adr = seq_input->GetCharsAddress();
1083
1084  int output[4];
1085  NativeRegExpMacroAssembler::Result result =
1086      Execute(*code,
1087              *input,
1088              0,
1089              start_adr,
1090              start_adr + input->length(),
1091              output);
1092
1093  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1094  CHECK_EQ(0, output[0]);
1095  CHECK_EQ(2, output[1]);
1096  CHECK_EQ(6, output[2]);
1097  CHECK_EQ(-1, output[3]);
1098}
1099
1100
1101TEST(MacroAssemblerNativeBackReferenceUC16) {
1102  v8::V8::Initialize();
1103  ContextInitializer initializer;
1104  Isolate* isolate = CcTest::i_isolate();
1105  Factory* factory = isolate->factory();
1106  Zone zone(CcTest::i_isolate()->allocator());
1107
1108  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::UC16,
1109                             4);
1110
1111  m.WriteCurrentPositionToRegister(0, 0);
1112  m.AdvanceCurrentPosition(2);
1113  m.WriteCurrentPositionToRegister(1, 0);
1114  Label nomatch;
1115  m.CheckNotBackReference(0, false, &nomatch);
1116  m.Fail();
1117  m.Bind(&nomatch);
1118  m.AdvanceCurrentPosition(2);
1119  Label missing_match;
1120  m.CheckNotBackReference(0, false, &missing_match);
1121  m.WriteCurrentPositionToRegister(2, 0);
1122  m.Succeed();
1123  m.Bind(&missing_match);
1124  m.Fail();
1125
1126  Handle<String> source = factory->NewStringFromStaticChars("^(..)..\1");
1127  Handle<Object> code_object = m.GetCode(source);
1128  Handle<Code> code = Handle<Code>::cast(code_object);
1129
1130  const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028};
1131  Handle<String> input = factory->NewStringFromTwoByte(
1132      Vector<const uc16>(input_data, 6)).ToHandleChecked();
1133  Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
1134  Address start_adr = seq_input->GetCharsAddress();
1135
1136  int output[4];
1137  NativeRegExpMacroAssembler::Result result =
1138      Execute(*code,
1139              *input,
1140              0,
1141              start_adr,
1142              start_adr + input->length() * 2,
1143              output);
1144
1145  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1146  CHECK_EQ(0, output[0]);
1147  CHECK_EQ(2, output[1]);
1148  CHECK_EQ(6, output[2]);
1149  CHECK_EQ(-1, output[3]);
1150}
1151
1152
1153
1154TEST(MacroAssemblernativeAtStart) {
1155  v8::V8::Initialize();
1156  ContextInitializer initializer;
1157  Isolate* isolate = CcTest::i_isolate();
1158  Factory* factory = isolate->factory();
1159  Zone zone(CcTest::i_isolate()->allocator());
1160
1161  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
1162                             0);
1163
1164  Label not_at_start, newline, fail;
1165  m.CheckNotAtStart(0, &not_at_start);
1166  // Check that prevchar = '\n' and current = 'f'.
1167  m.CheckCharacter('\n', &newline);
1168  m.Bind(&fail);
1169  m.Fail();
1170  m.Bind(&newline);
1171  m.LoadCurrentCharacter(0, &fail);
1172  m.CheckNotCharacter('f', &fail);
1173  m.Succeed();
1174
1175  m.Bind(&not_at_start);
1176  // Check that prevchar = 'o' and current = 'b'.
1177  Label prevo;
1178  m.CheckCharacter('o', &prevo);
1179  m.Fail();
1180  m.Bind(&prevo);
1181  m.LoadCurrentCharacter(0, &fail);
1182  m.CheckNotCharacter('b', &fail);
1183  m.Succeed();
1184
1185  Handle<String> source = factory->NewStringFromStaticChars("(^f|ob)");
1186  Handle<Object> code_object = m.GetCode(source);
1187  Handle<Code> code = Handle<Code>::cast(code_object);
1188
1189  Handle<String> input = factory->NewStringFromStaticChars("foobar");
1190  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1191  Address start_adr = seq_input->GetCharsAddress();
1192
1193  NativeRegExpMacroAssembler::Result result =
1194      Execute(*code,
1195              *input,
1196              0,
1197              start_adr,
1198              start_adr + input->length(),
1199              NULL);
1200
1201  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1202
1203  result = Execute(*code,
1204                   *input,
1205                   3,
1206                   start_adr + 3,
1207                   start_adr + input->length(),
1208                   NULL);
1209
1210  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1211}
1212
1213
1214TEST(MacroAssemblerNativeBackRefNoCase) {
1215  v8::V8::Initialize();
1216  ContextInitializer initializer;
1217  Isolate* isolate = CcTest::i_isolate();
1218  Factory* factory = isolate->factory();
1219  Zone zone(CcTest::i_isolate()->allocator());
1220
1221  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
1222                             4);
1223
1224  Label fail, succ;
1225
1226  m.WriteCurrentPositionToRegister(0, 0);
1227  m.WriteCurrentPositionToRegister(2, 0);
1228  m.AdvanceCurrentPosition(3);
1229  m.WriteCurrentPositionToRegister(3, 0);
1230  m.CheckNotBackReferenceIgnoreCase(2, false, false, &fail);  // Match "AbC".
1231  m.CheckNotBackReferenceIgnoreCase(2, false, false, &fail);  // Match "ABC".
1232  Label expected_fail;
1233  m.CheckNotBackReferenceIgnoreCase(2, false, false, &expected_fail);
1234  m.Bind(&fail);
1235  m.Fail();
1236
1237  m.Bind(&expected_fail);
1238  m.AdvanceCurrentPosition(3);  // Skip "xYz"
1239  m.CheckNotBackReferenceIgnoreCase(2, false, false, &succ);
1240  m.Fail();
1241
1242  m.Bind(&succ);
1243  m.WriteCurrentPositionToRegister(1, 0);
1244  m.Succeed();
1245
1246  Handle<String> source =
1247      factory->NewStringFromStaticChars("^(abc)\1\1(?!\1)...(?!\1)");
1248  Handle<Object> code_object = m.GetCode(source);
1249  Handle<Code> code = Handle<Code>::cast(code_object);
1250
1251  Handle<String> input = factory->NewStringFromStaticChars("aBcAbCABCxYzab");
1252  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1253  Address start_adr = seq_input->GetCharsAddress();
1254
1255  int output[4];
1256  NativeRegExpMacroAssembler::Result result =
1257      Execute(*code,
1258              *input,
1259              0,
1260              start_adr,
1261              start_adr + input->length(),
1262              output);
1263
1264  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1265  CHECK_EQ(0, output[0]);
1266  CHECK_EQ(12, output[1]);
1267  CHECK_EQ(0, output[2]);
1268  CHECK_EQ(3, output[3]);
1269}
1270
1271
1272
1273TEST(MacroAssemblerNativeRegisters) {
1274  v8::V8::Initialize();
1275  ContextInitializer initializer;
1276  Isolate* isolate = CcTest::i_isolate();
1277  Factory* factory = isolate->factory();
1278  Zone zone(CcTest::i_isolate()->allocator());
1279
1280  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
1281                             6);
1282
1283  uc16 foo_chars[3] = {'f', 'o', 'o'};
1284  Vector<const uc16> foo(foo_chars, 3);
1285
1286  enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt };
1287  Label fail;
1288  Label backtrack;
1289  m.WriteCurrentPositionToRegister(out1, 0);  // Output: [0]
1290  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1291  m.PushBacktrack(&backtrack);
1292  m.WriteStackPointerToRegister(sp);
1293  // Fill stack and registers
1294  m.AdvanceCurrentPosition(2);
1295  m.WriteCurrentPositionToRegister(out1, 0);
1296  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1297  m.PushBacktrack(&fail);
1298  // Drop backtrack stack frames.
1299  m.ReadStackPointerFromRegister(sp);
1300  // And take the first backtrack (to &backtrack)
1301  m.Backtrack();
1302
1303  m.PushCurrentPosition();
1304  m.AdvanceCurrentPosition(2);
1305  m.PopCurrentPosition();
1306
1307  m.Bind(&backtrack);
1308  m.PopRegister(out1);
1309  m.ReadCurrentPositionFromRegister(out1);
1310  m.AdvanceCurrentPosition(3);
1311  m.WriteCurrentPositionToRegister(out2, 0);  // [0,3]
1312
1313  Label loop;
1314  m.SetRegister(loop_cnt, 0);  // loop counter
1315  m.Bind(&loop);
1316  m.AdvanceRegister(loop_cnt, 1);
1317  m.AdvanceCurrentPosition(1);
1318  m.IfRegisterLT(loop_cnt, 3, &loop);
1319  m.WriteCurrentPositionToRegister(out3, 0);  // [0,3,6]
1320
1321  Label loop2;
1322  m.SetRegister(loop_cnt, 2);  // loop counter
1323  m.Bind(&loop2);
1324  m.AdvanceRegister(loop_cnt, -1);
1325  m.AdvanceCurrentPosition(1);
1326  m.IfRegisterGE(loop_cnt, 0, &loop2);
1327  m.WriteCurrentPositionToRegister(out4, 0);  // [0,3,6,9]
1328
1329  Label loop3;
1330  Label exit_loop3;
1331  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1332  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1333  m.ReadCurrentPositionFromRegister(out3);
1334  m.Bind(&loop3);
1335  m.AdvanceCurrentPosition(1);
1336  m.CheckGreedyLoop(&exit_loop3);
1337  m.GoTo(&loop3);
1338  m.Bind(&exit_loop3);
1339  m.PopCurrentPosition();
1340  m.WriteCurrentPositionToRegister(out5, 0);  // [0,3,6,9,9,-1]
1341
1342  m.Succeed();
1343
1344  m.Bind(&fail);
1345  m.Fail();
1346
1347  Handle<String> source = factory->NewStringFromStaticChars("<loop test>");
1348  Handle<Object> code_object = m.GetCode(source);
1349  Handle<Code> code = Handle<Code>::cast(code_object);
1350
1351  // String long enough for test (content doesn't matter).
1352  Handle<String> input = factory->NewStringFromStaticChars("foofoofoofoofoo");
1353  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1354  Address start_adr = seq_input->GetCharsAddress();
1355
1356  int output[6];
1357  NativeRegExpMacroAssembler::Result result =
1358      Execute(*code,
1359              *input,
1360              0,
1361              start_adr,
1362              start_adr + input->length(),
1363              output);
1364
1365  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1366  CHECK_EQ(0, output[0]);
1367  CHECK_EQ(3, output[1]);
1368  CHECK_EQ(6, output[2]);
1369  CHECK_EQ(9, output[3]);
1370  CHECK_EQ(9, output[4]);
1371  CHECK_EQ(-1, output[5]);
1372}
1373
1374
1375TEST(MacroAssemblerStackOverflow) {
1376  v8::V8::Initialize();
1377  ContextInitializer initializer;
1378  Isolate* isolate = CcTest::i_isolate();
1379  Factory* factory = isolate->factory();
1380  Zone zone(CcTest::i_isolate()->allocator());
1381
1382  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
1383                             0);
1384
1385  Label loop;
1386  m.Bind(&loop);
1387  m.PushBacktrack(&loop);
1388  m.GoTo(&loop);
1389
1390  Handle<String> source =
1391      factory->NewStringFromStaticChars("<stack overflow test>");
1392  Handle<Object> code_object = m.GetCode(source);
1393  Handle<Code> code = Handle<Code>::cast(code_object);
1394
1395  // String long enough for test (content doesn't matter).
1396  Handle<String> input = factory->NewStringFromStaticChars("dummy");
1397  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1398  Address start_adr = seq_input->GetCharsAddress();
1399
1400  NativeRegExpMacroAssembler::Result result =
1401      Execute(*code,
1402              *input,
1403              0,
1404              start_adr,
1405              start_adr + input->length(),
1406              NULL);
1407
1408  CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result);
1409  CHECK(isolate->has_pending_exception());
1410  isolate->clear_pending_exception();
1411}
1412
1413
1414TEST(MacroAssemblerNativeLotsOfRegisters) {
1415  v8::V8::Initialize();
1416  ContextInitializer initializer;
1417  Isolate* isolate = CcTest::i_isolate();
1418  Factory* factory = isolate->factory();
1419  Zone zone(CcTest::i_isolate()->allocator());
1420
1421  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
1422                             2);
1423
1424  // At least 2048, to ensure the allocated space for registers
1425  // span one full page.
1426  const int large_number = 8000;
1427  m.WriteCurrentPositionToRegister(large_number, 42);
1428  m.WriteCurrentPositionToRegister(0, 0);
1429  m.WriteCurrentPositionToRegister(1, 1);
1430  Label done;
1431  m.CheckNotBackReference(0, false, &done);  // Performs a system-stack push.
1432  m.Bind(&done);
1433  m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck);
1434  m.PopRegister(1);
1435  m.Succeed();
1436
1437  Handle<String> source =
1438      factory->NewStringFromStaticChars("<huge register space test>");
1439  Handle<Object> code_object = m.GetCode(source);
1440  Handle<Code> code = Handle<Code>::cast(code_object);
1441
1442  // String long enough for test (content doesn't matter).
1443  Handle<String> input = factory->NewStringFromStaticChars("sample text");
1444  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1445  Address start_adr = seq_input->GetCharsAddress();
1446
1447  int captures[2];
1448  NativeRegExpMacroAssembler::Result result =
1449      Execute(*code,
1450              *input,
1451              0,
1452              start_adr,
1453              start_adr + input->length(),
1454              captures);
1455
1456  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1457  CHECK_EQ(0, captures[0]);
1458  CHECK_EQ(42, captures[1]);
1459
1460  isolate->clear_pending_exception();
1461}
1462
1463#else  // V8_INTERPRETED_REGEXP
1464
1465TEST(MacroAssembler) {
1466  byte codes[1024];
1467  Zone zone(CcTest::i_isolate()->allocator());
1468  RegExpMacroAssemblerIrregexp m(CcTest::i_isolate(), Vector<byte>(codes, 1024),
1469                                 &zone);
1470  // ^f(o)o.
1471  Label start, fail, backtrack;
1472
1473  m.SetRegister(4, 42);
1474  m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck);
1475  m.AdvanceRegister(4, 42);
1476  m.GoTo(&start);
1477  m.Fail();
1478  m.Bind(&start);
1479  m.PushBacktrack(&fail);
1480  m.CheckNotAtStart(0, NULL);
1481  m.LoadCurrentCharacter(0, NULL);
1482  m.CheckNotCharacter('f', NULL);
1483  m.LoadCurrentCharacter(1, NULL);
1484  m.CheckNotCharacter('o', NULL);
1485  m.LoadCurrentCharacter(2, NULL);
1486  m.CheckNotCharacter('o', NULL);
1487  m.WriteCurrentPositionToRegister(0, 0);
1488  m.WriteCurrentPositionToRegister(1, 3);
1489  m.WriteCurrentPositionToRegister(2, 1);
1490  m.WriteCurrentPositionToRegister(3, 2);
1491  m.AdvanceCurrentPosition(3);
1492  m.PushBacktrack(&backtrack);
1493  m.Succeed();
1494  m.Bind(&backtrack);
1495  m.ClearRegisters(2, 3);
1496  m.Backtrack();
1497  m.Bind(&fail);
1498  m.PopRegister(0);
1499  m.Fail();
1500
1501  Isolate* isolate = CcTest::i_isolate();
1502  Factory* factory = isolate->factory();
1503  HandleScope scope(isolate);
1504
1505  Handle<String> source = factory->NewStringFromStaticChars("^f(o)o");
1506  Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source));
1507  int captures[5];
1508
1509  const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'};
1510  Handle<String> f1_16 = factory->NewStringFromTwoByte(
1511      Vector<const uc16>(str1, 6)).ToHandleChecked();
1512
1513  CHECK(IrregexpInterpreter::Match(isolate, array, f1_16, captures, 0));
1514  CHECK_EQ(0, captures[0]);
1515  CHECK_EQ(3, captures[1]);
1516  CHECK_EQ(1, captures[2]);
1517  CHECK_EQ(2, captures[3]);
1518  CHECK_EQ(84, captures[4]);
1519
1520  const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'};
1521  Handle<String> f2_16 = factory->NewStringFromTwoByte(
1522      Vector<const uc16>(str2, 6)).ToHandleChecked();
1523
1524  CHECK(!IrregexpInterpreter::Match(isolate, array, f2_16, captures, 0));
1525  CHECK_EQ(42, captures[0]);
1526}
1527
1528#endif  // V8_INTERPRETED_REGEXP
1529
1530
1531TEST(AddInverseToTable) {
1532  static const int kLimit = 1000;
1533  static const int kRangeCount = 16;
1534  for (int t = 0; t < 10; t++) {
1535    Zone zone(CcTest::i_isolate()->allocator());
1536    ZoneList<CharacterRange>* ranges =
1537        new(&zone) ZoneList<CharacterRange>(kRangeCount, &zone);
1538    for (int i = 0; i < kRangeCount; i++) {
1539      int from = PseudoRandom(t + 87, i + 25) % kLimit;
1540      int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20));
1541      if (to > kLimit) to = kLimit;
1542      ranges->Add(CharacterRange::Range(from, to), &zone);
1543    }
1544    DispatchTable table(&zone);
1545    DispatchTableConstructor cons(&table, false, &zone);
1546    cons.set_choice_index(0);
1547    cons.AddInverse(ranges);
1548    for (int i = 0; i < kLimit; i++) {
1549      bool is_on = false;
1550      for (int j = 0; !is_on && j < kRangeCount; j++)
1551        is_on = ranges->at(j).Contains(i);
1552      OutSet* set = table.Get(i);
1553      CHECK_EQ(is_on, set->Get(0) == false);
1554    }
1555  }
1556  Zone zone(CcTest::i_isolate()->allocator());
1557  ZoneList<CharacterRange>* ranges =
1558      new(&zone) ZoneList<CharacterRange>(1, &zone);
1559  ranges->Add(CharacterRange::Range(0xFFF0, 0xFFFE), &zone);
1560  DispatchTable table(&zone);
1561  DispatchTableConstructor cons(&table, false, &zone);
1562  cons.set_choice_index(0);
1563  cons.AddInverse(ranges);
1564  CHECK(!table.Get(0xFFFE)->Get(0));
1565  CHECK(table.Get(0xFFFF)->Get(0));
1566}
1567
1568
1569static uc32 canonicalize(uc32 c) {
1570  unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
1571  int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL);
1572  if (count == 0) {
1573    return c;
1574  } else {
1575    CHECK_EQ(1, count);
1576    return canon[0];
1577  }
1578}
1579
1580
1581TEST(LatinCanonicalize) {
1582  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1583  for (unibrow::uchar lower = 'a'; lower <= 'z'; lower++) {
1584    unibrow::uchar upper = lower + ('A' - 'a');
1585    CHECK_EQ(canonicalize(lower), canonicalize(upper));
1586    unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1587    int length = un_canonicalize.get(lower, '\0', uncanon);
1588    CHECK_EQ(2, length);
1589    CHECK_EQ(upper, uncanon[0]);
1590    CHECK_EQ(lower, uncanon[1]);
1591  }
1592  for (uc32 c = 128; c < (1 << 21); c++)
1593    CHECK_GE(canonicalize(c), 128);
1594  unibrow::Mapping<unibrow::ToUppercase> to_upper;
1595  // Canonicalization is only defined for the Basic Multilingual Plane.
1596  for (uc32 c = 0; c < (1 << 16); c++) {
1597    unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
1598    int length = to_upper.get(c, '\0', upper);
1599    if (length == 0) {
1600      length = 1;
1601      upper[0] = c;
1602    }
1603    uc32 u = upper[0];
1604    if (length > 1 || (c >= 128 && u < 128))
1605      u = c;
1606    CHECK_EQ(u, canonicalize(c));
1607  }
1608}
1609
1610
1611static uc32 CanonRangeEnd(uc32 c) {
1612  unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
1613  int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
1614  if (count == 0) {
1615    return c;
1616  } else {
1617    CHECK_EQ(1, count);
1618    return canon[0];
1619  }
1620}
1621
1622
1623TEST(RangeCanonicalization) {
1624  // Check that we arrive at the same result when using the basic
1625  // range canonicalization primitives as when using immediate
1626  // canonicalization.
1627  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1628  int block_start = 0;
1629  while (block_start <= 0xFFFF) {
1630    uc32 block_end = CanonRangeEnd(block_start);
1631    unsigned block_length = block_end - block_start + 1;
1632    if (block_length > 1) {
1633      unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1634      int first_length = un_canonicalize.get(block_start, '\0', first);
1635      for (unsigned i = 1; i < block_length; i++) {
1636        unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1637        int succ_length = un_canonicalize.get(block_start + i, '\0', succ);
1638        CHECK_EQ(first_length, succ_length);
1639        for (int j = 0; j < succ_length; j++) {
1640          int calc = first[j] + i;
1641          int found = succ[j];
1642          CHECK_EQ(calc, found);
1643        }
1644      }
1645    }
1646    block_start = block_start + block_length;
1647  }
1648}
1649
1650
1651TEST(UncanonicalizeEquivalence) {
1652  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1653  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1654  for (int i = 0; i < (1 << 16); i++) {
1655    int length = un_canonicalize.get(i, '\0', chars);
1656    for (int j = 0; j < length; j++) {
1657      unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1658      int length2 = un_canonicalize.get(chars[j], '\0', chars2);
1659      CHECK_EQ(length, length2);
1660      for (int k = 0; k < length; k++)
1661        CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k]));
1662    }
1663  }
1664}
1665
1666
1667static void TestRangeCaseIndependence(Isolate* isolate, CharacterRange input,
1668                                      Vector<CharacterRange> expected) {
1669  Zone zone(CcTest::i_isolate()->allocator());
1670  int count = expected.length();
1671  ZoneList<CharacterRange>* list =
1672      new(&zone) ZoneList<CharacterRange>(count, &zone);
1673  list->Add(input, &zone);
1674  CharacterRange::AddCaseEquivalents(isolate, &zone, list, false);
1675  list->Remove(0);  // Remove the input before checking results.
1676  CHECK_EQ(count, list->length());
1677  for (int i = 0; i < list->length(); i++) {
1678    CHECK_EQ(expected[i].from(), list->at(i).from());
1679    CHECK_EQ(expected[i].to(), list->at(i).to());
1680  }
1681}
1682
1683
1684static void TestSimpleRangeCaseIndependence(Isolate* isolate,
1685                                            CharacterRange input,
1686                                            CharacterRange expected) {
1687  EmbeddedVector<CharacterRange, 1> vector;
1688  vector[0] = expected;
1689  TestRangeCaseIndependence(isolate, input, vector);
1690}
1691
1692
1693TEST(CharacterRangeCaseIndependence) {
1694  Isolate* isolate = CcTest::i_isolate();
1695  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Singleton('a'),
1696                                  CharacterRange::Singleton('A'));
1697  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Singleton('z'),
1698                                  CharacterRange::Singleton('Z'));
1699  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('a', 'z'),
1700                                  CharacterRange::Range('A', 'Z'));
1701  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('c', 'f'),
1702                                  CharacterRange::Range('C', 'F'));
1703  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('a', 'b'),
1704                                  CharacterRange::Range('A', 'B'));
1705  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('y', 'z'),
1706                                  CharacterRange::Range('Y', 'Z'));
1707  TestSimpleRangeCaseIndependence(isolate,
1708                                  CharacterRange::Range('a' - 1, 'z' + 1),
1709                                  CharacterRange::Range('A', 'Z'));
1710  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('A', 'Z'),
1711                                  CharacterRange::Range('a', 'z'));
1712  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('C', 'F'),
1713                                  CharacterRange::Range('c', 'f'));
1714  TestSimpleRangeCaseIndependence(isolate,
1715                                  CharacterRange::Range('A' - 1, 'Z' + 1),
1716                                  CharacterRange::Range('a', 'z'));
1717  // Here we need to add [l-z] to complete the case independence of
1718  // [A-Za-z] but we expect [a-z] to be added since we always add a
1719  // whole block at a time.
1720  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('A', 'k'),
1721                                  CharacterRange::Range('a', 'z'));
1722}
1723
1724
1725static bool InClass(uc32 c, ZoneList<CharacterRange>* ranges) {
1726  if (ranges == NULL)
1727    return false;
1728  for (int i = 0; i < ranges->length(); i++) {
1729    CharacterRange range = ranges->at(i);
1730    if (range.from() <= c && c <= range.to())
1731      return true;
1732  }
1733  return false;
1734}
1735
1736
1737TEST(UnicodeRangeSplitter) {
1738  Zone zone(CcTest::i_isolate()->allocator());
1739  ZoneList<CharacterRange>* base =
1740      new(&zone) ZoneList<CharacterRange>(1, &zone);
1741  base->Add(CharacterRange::Everything(), &zone);
1742  UnicodeRangeSplitter splitter(&zone, base);
1743  // BMP
1744  for (uc32 c = 0; c < 0xd800; c++) {
1745    CHECK(InClass(c, splitter.bmp()));
1746    CHECK(!InClass(c, splitter.lead_surrogates()));
1747    CHECK(!InClass(c, splitter.trail_surrogates()));
1748    CHECK(!InClass(c, splitter.non_bmp()));
1749  }
1750  // Lead surrogates
1751  for (uc32 c = 0xd800; c < 0xdbff; c++) {
1752    CHECK(!InClass(c, splitter.bmp()));
1753    CHECK(InClass(c, splitter.lead_surrogates()));
1754    CHECK(!InClass(c, splitter.trail_surrogates()));
1755    CHECK(!InClass(c, splitter.non_bmp()));
1756  }
1757  // Trail surrogates
1758  for (uc32 c = 0xdc00; c < 0xdfff; c++) {
1759    CHECK(!InClass(c, splitter.bmp()));
1760    CHECK(!InClass(c, splitter.lead_surrogates()));
1761    CHECK(InClass(c, splitter.trail_surrogates()));
1762    CHECK(!InClass(c, splitter.non_bmp()));
1763  }
1764  // BMP
1765  for (uc32 c = 0xe000; c < 0xffff; c++) {
1766    CHECK(InClass(c, splitter.bmp()));
1767    CHECK(!InClass(c, splitter.lead_surrogates()));
1768    CHECK(!InClass(c, splitter.trail_surrogates()));
1769    CHECK(!InClass(c, splitter.non_bmp()));
1770  }
1771  // Non-BMP
1772  for (uc32 c = 0x10000; c < 0x10ffff; c++) {
1773    CHECK(!InClass(c, splitter.bmp()));
1774    CHECK(!InClass(c, splitter.lead_surrogates()));
1775    CHECK(!InClass(c, splitter.trail_surrogates()));
1776    CHECK(InClass(c, splitter.non_bmp()));
1777  }
1778}
1779
1780
1781TEST(CanonicalizeCharacterSets) {
1782  Zone zone(CcTest::i_isolate()->allocator());
1783  ZoneList<CharacterRange>* list =
1784      new(&zone) ZoneList<CharacterRange>(4, &zone);
1785  CharacterSet set(list);
1786
1787  list->Add(CharacterRange::Range(10, 20), &zone);
1788  list->Add(CharacterRange::Range(30, 40), &zone);
1789  list->Add(CharacterRange::Range(50, 60), &zone);
1790  set.Canonicalize();
1791  CHECK_EQ(3, list->length());
1792  CHECK_EQ(10, list->at(0).from());
1793  CHECK_EQ(20, list->at(0).to());
1794  CHECK_EQ(30, list->at(1).from());
1795  CHECK_EQ(40, list->at(1).to());
1796  CHECK_EQ(50, list->at(2).from());
1797  CHECK_EQ(60, list->at(2).to());
1798
1799  list->Rewind(0);
1800  list->Add(CharacterRange::Range(10, 20), &zone);
1801  list->Add(CharacterRange::Range(50, 60), &zone);
1802  list->Add(CharacterRange::Range(30, 40), &zone);
1803  set.Canonicalize();
1804  CHECK_EQ(3, list->length());
1805  CHECK_EQ(10, list->at(0).from());
1806  CHECK_EQ(20, list->at(0).to());
1807  CHECK_EQ(30, list->at(1).from());
1808  CHECK_EQ(40, list->at(1).to());
1809  CHECK_EQ(50, list->at(2).from());
1810  CHECK_EQ(60, list->at(2).to());
1811
1812  list->Rewind(0);
1813  list->Add(CharacterRange::Range(30, 40), &zone);
1814  list->Add(CharacterRange::Range(10, 20), &zone);
1815  list->Add(CharacterRange::Range(25, 25), &zone);
1816  list->Add(CharacterRange::Range(100, 100), &zone);
1817  list->Add(CharacterRange::Range(1, 1), &zone);
1818  set.Canonicalize();
1819  CHECK_EQ(5, list->length());
1820  CHECK_EQ(1, list->at(0).from());
1821  CHECK_EQ(1, list->at(0).to());
1822  CHECK_EQ(10, list->at(1).from());
1823  CHECK_EQ(20, list->at(1).to());
1824  CHECK_EQ(25, list->at(2).from());
1825  CHECK_EQ(25, list->at(2).to());
1826  CHECK_EQ(30, list->at(3).from());
1827  CHECK_EQ(40, list->at(3).to());
1828  CHECK_EQ(100, list->at(4).from());
1829  CHECK_EQ(100, list->at(4).to());
1830
1831  list->Rewind(0);
1832  list->Add(CharacterRange::Range(10, 19), &zone);
1833  list->Add(CharacterRange::Range(21, 30), &zone);
1834  list->Add(CharacterRange::Range(20, 20), &zone);
1835  set.Canonicalize();
1836  CHECK_EQ(1, list->length());
1837  CHECK_EQ(10, list->at(0).from());
1838  CHECK_EQ(30, list->at(0).to());
1839}
1840
1841
1842TEST(CharacterRangeMerge) {
1843  Zone zone(CcTest::i_isolate()->allocator());
1844  ZoneList<CharacterRange> l1(4, &zone);
1845  ZoneList<CharacterRange> l2(4, &zone);
1846  // Create all combinations of intersections of ranges, both singletons and
1847  // longer.
1848
1849  int offset = 0;
1850
1851  // The five kinds of singleton intersections:
1852  //     X
1853  //   Y      - outside before
1854  //    Y     - outside touching start
1855  //     Y    - overlap
1856  //      Y   - outside touching end
1857  //       Y  - outside after
1858
1859  for (int i = 0; i < 5; i++) {
1860    l1.Add(CharacterRange::Singleton(offset + 2), &zone);
1861    l2.Add(CharacterRange::Singleton(offset + i), &zone);
1862    offset += 6;
1863  }
1864
1865  // The seven kinds of singleton/non-singleton intersections:
1866  //    XXX
1867  //  Y        - outside before
1868  //   Y       - outside touching start
1869  //    Y      - inside touching start
1870  //     Y     - entirely inside
1871  //      Y    - inside touching end
1872  //       Y   - outside touching end
1873  //        Y  - disjoint after
1874
1875  for (int i = 0; i < 7; i++) {
1876    l1.Add(CharacterRange::Range(offset + 2, offset + 4), &zone);
1877    l2.Add(CharacterRange::Singleton(offset + i), &zone);
1878    offset += 8;
1879  }
1880
1881  // The eleven kinds of non-singleton intersections:
1882  //
1883  //       XXXXXXXX
1884  // YYYY                  - outside before.
1885  //   YYYY                - outside touching start.
1886  //     YYYY              - overlapping start
1887  //       YYYY            - inside touching start
1888  //         YYYY          - entirely inside
1889  //           YYYY        - inside touching end
1890  //             YYYY      - overlapping end
1891  //               YYYY    - outside touching end
1892  //                 YYYY  - outside after
1893  //       YYYYYYYY        - identical
1894  //     YYYYYYYYYYYY      - containing entirely.
1895
1896  for (int i = 0; i < 9; i++) {
1897    l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);  // Length 8.
1898    l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3), &zone);
1899    offset += 22;
1900  }
1901  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1902  l2.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1903  offset += 22;
1904  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1905  l2.Add(CharacterRange::Range(offset + 4, offset + 17), &zone);
1906  offset += 22;
1907
1908  // Different kinds of multi-range overlap:
1909  // XXXXXXXXXXXXXXXXXXXXXX         XXXXXXXXXXXXXXXXXXXXXX
1910  //   YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y
1911
1912  l1.Add(CharacterRange::Range(offset, offset + 21), &zone);
1913  l1.Add(CharacterRange::Range(offset + 31, offset + 52), &zone);
1914  for (int i = 0; i < 6; i++) {
1915    l2.Add(CharacterRange::Range(offset + 2, offset + 5), &zone);
1916    l2.Add(CharacterRange::Singleton(offset + 8), &zone);
1917    offset += 9;
1918  }
1919
1920  CHECK(CharacterRange::IsCanonical(&l1));
1921  CHECK(CharacterRange::IsCanonical(&l2));
1922
1923  ZoneList<CharacterRange> first_only(4, &zone);
1924  ZoneList<CharacterRange> second_only(4, &zone);
1925  ZoneList<CharacterRange> both(4, &zone);
1926}
1927
1928
1929TEST(Graph) {
1930  Execute("\\b\\w+\\b", false, true, true);
1931}
1932
1933
1934namespace {
1935
1936int* global_use_counts = NULL;
1937
1938void MockUseCounterCallback(v8::Isolate* isolate,
1939                            v8::Isolate::UseCounterFeature feature) {
1940  ++global_use_counts[feature];
1941}
1942}
1943
1944
1945// Test that ES2015 RegExp compatibility fixes are in place, that they
1946// are not overly broad, and the appropriate UseCounters are incremented
1947TEST(UseCountRegExp) {
1948  v8::Isolate* isolate = CcTest::isolate();
1949  v8::HandleScope scope(isolate);
1950  LocalContext env;
1951  int use_counts[v8::Isolate::kUseCounterFeatureCount] = {};
1952  global_use_counts = use_counts;
1953  CcTest::isolate()->SetUseCounterCallback(MockUseCounterCallback);
1954
1955  // Compat fix: RegExp.prototype.sticky == undefined; UseCounter tracks it
1956  v8::Local<v8::Value> resultSticky = CompileRun("RegExp.prototype.sticky");
1957  CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1958  CHECK_EQ(0, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1959  CHECK(resultSticky->IsUndefined());
1960
1961  // re.sticky has approriate value and doesn't touch UseCounter
1962  v8::Local<v8::Value> resultReSticky = CompileRun("/a/.sticky");
1963  CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1964  CHECK_EQ(0, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1965  CHECK(resultReSticky->IsFalse());
1966
1967  // When the getter is caleld on another object, throw an exception
1968  // and don't increment the UseCounter
1969  v8::Local<v8::Value> resultStickyError = CompileRun(
1970      "var exception;"
1971      "try { "
1972      "  Object.getOwnPropertyDescriptor(RegExp.prototype, 'sticky')"
1973      "      .get.call(null);"
1974      "} catch (e) {"
1975      "  exception = e;"
1976      "}"
1977      "exception");
1978  CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1979  CHECK_EQ(0, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1980  CHECK(resultStickyError->IsObject());
1981
1982  // RegExp.prototype.toString() returns '/(?:)/' as a compatibility fix;
1983  // a UseCounter is incremented to track it.
1984  v8::Local<v8::Value> resultToString =
1985      CompileRun("RegExp.prototype.toString().length");
1986  CHECK_EQ(2, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1987  CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1988  CHECK(resultToString->IsInt32());
1989  CHECK_EQ(6,
1990           resultToString->Int32Value(isolate->GetCurrentContext()).FromJust());
1991
1992  // .toString() works on normal RegExps
1993  v8::Local<v8::Value> resultReToString = CompileRun("/a/.toString().length");
1994  CHECK_EQ(2, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1995  CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1996  CHECK(resultReToString->IsInt32());
1997  CHECK_EQ(
1998      3, resultReToString->Int32Value(isolate->GetCurrentContext()).FromJust());
1999
2000  // .toString() throws on non-RegExps that aren't RegExp.prototype
2001  v8::Local<v8::Value> resultToStringError = CompileRun(
2002      "var exception;"
2003      "try { RegExp.prototype.toString.call(null) }"
2004      "catch (e) { exception = e; }"
2005      "exception");
2006  CHECK_EQ(2, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
2007  CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeToString]);
2008  CHECK(resultToStringError->IsObject());
2009}
2010
2011class UncachedExternalString
2012    : public v8::String::ExternalOneByteStringResource {
2013 public:
2014  const char* data() const override { return "abcdefghijklmnopqrstuvwxyz"; }
2015  size_t length() const override { return 26; }
2016  bool IsCompressible() const override { return true; }
2017};
2018
2019TEST(UncachedExternalString) {
2020  v8::Isolate* isolate = CcTest::isolate();
2021  v8::HandleScope scope(isolate);
2022  LocalContext env;
2023  v8::Local<v8::String> external =
2024      v8::String::NewExternalOneByte(isolate, new UncachedExternalString())
2025          .ToLocalChecked();
2026  CHECK(v8::Utils::OpenHandle(*external)->map() ==
2027        CcTest::i_isolate()->heap()->short_external_one_byte_string_map());
2028  v8::Local<v8::Object> global = env->Global();
2029  global->Set(env.local(), v8_str("external"), external).FromJust();
2030  CompileRun("var re = /y(.)/; re.test('ab');");
2031  ExpectString("external.substring(1).match(re)[1]", "z");
2032}
2033