1// -*- coding: utf-8 -*-
2// Copyright 2002-2009 The RE2 Authors.  All Rights Reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6// TODO: Test extractions for PartialMatch/Consume
7
8#include <sys/types.h>
9#include <sys/mman.h>
10#include <sys/stat.h>
11#include <errno.h>
12#include <vector>
13#include "util/test.h"
14#include "re2/re2.h"
15#include "re2/regexp.h"
16
17DECLARE_bool(logtostderr);
18
19namespace re2 {
20
21TEST(RE2, HexTests) {
22
23  VLOG(1) << "hex tests";
24
25#define CHECK_HEX(type, value) \
26  do { \
27    type v; \
28    CHECK(RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \
29    CHECK_EQ(v, 0x ## value); \
30    CHECK(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
31    CHECK_EQ(v, 0x ## value); \
32  } while(0)
33
34  CHECK_HEX(short,              2bad);
35  CHECK_HEX(unsigned short,     2badU);
36  CHECK_HEX(int,                dead);
37  CHECK_HEX(unsigned int,       deadU);
38  CHECK_HEX(long,               7eadbeefL);
39  CHECK_HEX(unsigned long,      deadbeefUL);
40  CHECK_HEX(long long,          12345678deadbeefLL);
41  CHECK_HEX(unsigned long long, cafebabedeadbeefULL);
42
43#undef CHECK_HEX
44}
45
46TEST(RE2, OctalTests) {
47  VLOG(1) << "octal tests";
48
49#define CHECK_OCTAL(type, value) \
50  do { \
51    type v; \
52    CHECK(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \
53    CHECK_EQ(v, 0 ## value); \
54    CHECK(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
55    CHECK_EQ(v, 0 ## value); \
56  } while(0)
57
58  CHECK_OCTAL(short,              77777);
59  CHECK_OCTAL(unsigned short,     177777U);
60  CHECK_OCTAL(int,                17777777777);
61  CHECK_OCTAL(unsigned int,       37777777777U);
62  CHECK_OCTAL(long,               17777777777L);
63  CHECK_OCTAL(unsigned long,      37777777777UL);
64  CHECK_OCTAL(long long,          777777777777777777777LL);
65  CHECK_OCTAL(unsigned long long, 1777777777777777777777ULL);
66
67#undef CHECK_OCTAL
68}
69
70TEST(RE2, DecimalTests) {
71  VLOG(1) << "decimal tests";
72
73#define CHECK_DECIMAL(type, value) \
74  do { \
75    type v; \
76    CHECK(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \
77    CHECK_EQ(v, value); \
78    CHECK(RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
79    CHECK_EQ(v, value); \
80  } while(0)
81
82  CHECK_DECIMAL(short,              -1);
83  CHECK_DECIMAL(unsigned short,     9999);
84  CHECK_DECIMAL(int,                -1000);
85  CHECK_DECIMAL(unsigned int,       12345U);
86  CHECK_DECIMAL(long,               -10000000L);
87  CHECK_DECIMAL(unsigned long,      3083324652U);
88  CHECK_DECIMAL(long long,          -100000000000000LL);
89  CHECK_DECIMAL(unsigned long long, 1234567890987654321ULL);
90
91#undef CHECK_DECIMAL
92}
93
94TEST(RE2, Replace) {
95  VLOG(1) << "TestReplace";
96
97  struct ReplaceTest {
98    const char *regexp;
99    const char *rewrite;
100    const char *original;
101    const char *single;
102    const char *global;
103    int        greplace_count;
104  };
105  static const ReplaceTest tests[] = {
106    { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
107      "\\2\\1ay",
108      "the quick brown fox jumps over the lazy dogs.",
109      "ethay quick brown fox jumps over the lazy dogs.",
110      "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
111      9 },
112    { "\\w+",
113      "\\0-NOSPAM",
114      "abcd.efghi@google.com",
115      "abcd-NOSPAM.efghi@google.com",
116      "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM",
117      4 },
118    { "^",
119      "(START)",
120      "foo",
121      "(START)foo",
122      "(START)foo",
123      1 },
124    { "^",
125      "(START)",
126      "",
127      "(START)",
128      "(START)",
129      1 },
130    { "$",
131      "(END)",
132      "",
133      "(END)",
134      "(END)",
135      1 },
136    { "b",
137      "bb",
138      "ababababab",
139      "abbabababab",
140      "abbabbabbabbabb",
141      5 },
142    { "b",
143      "bb",
144      "bbbbbb",
145      "bbbbbbb",
146      "bbbbbbbbbbbb",
147      6 },
148    { "b+",
149      "bb",
150      "bbbbbb",
151      "bb",
152      "bb",
153      1 },
154    { "b*",
155      "bb",
156      "bbbbbb",
157      "bb",
158      "bb",
159      1 },
160    { "b*",
161      "bb",
162      "aaaaa",
163      "bbaaaaa",
164      "bbabbabbabbabbabb",
165      6 },
166    // Check newline handling
167    { "a.*a",
168      "(\\0)",
169      "aba\naba",
170      "(aba)\naba",
171      "(aba)\n(aba)",
172      2 },
173    { "", NULL, NULL, NULL, NULL, 0 }
174  };
175
176  for (const ReplaceTest *t = tests; t->original != NULL; ++t) {
177    VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite);
178    string one(t->original);
179    CHECK(RE2::Replace(&one, t->regexp, t->rewrite));
180    CHECK_EQ(one, t->single);
181    string all(t->original);
182    CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
183      << "Got: " << all;
184    CHECK_EQ(all, t->global);
185  }
186}
187
188static void TestCheckRewriteString(const char* regexp, const char* rewrite,
189                              bool expect_ok) {
190  string error;
191  RE2 exp(regexp);
192  bool actual_ok = exp.CheckRewriteString(rewrite, &error);
193  EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
194}
195
196TEST(CheckRewriteString, all) {
197  TestCheckRewriteString("abc", "foo", true);
198  TestCheckRewriteString("abc", "foo\\", false);
199  TestCheckRewriteString("abc", "foo\\0bar", true);
200
201  TestCheckRewriteString("a(b)c", "foo", true);
202  TestCheckRewriteString("a(b)c", "foo\\0bar", true);
203  TestCheckRewriteString("a(b)c", "foo\\1bar", true);
204  TestCheckRewriteString("a(b)c", "foo\\2bar", false);
205  TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true);
206
207  TestCheckRewriteString("a(b)(c)", "foo\\12", true);
208  TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true);
209  TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false);
210}
211
212TEST(RE2, Extract) {
213  VLOG(1) << "TestExtract";
214
215  string s;
216
217  CHECK(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s));
218  CHECK_EQ(s, "kremvax!boris");
219
220  CHECK(RE2::Extract("foo", ".*", "'\\0'", &s));
221  CHECK_EQ(s, "'foo'");
222  // check that false match doesn't overwrite
223  CHECK(!RE2::Extract("baz", "bar", "'\\0'", &s));
224  CHECK_EQ(s, "'foo'");
225}
226
227TEST(RE2, Consume) {
228  VLOG(1) << "TestConsume";
229
230  RE2 r("\\s*(\\w+)");    // matches a word, possibly proceeded by whitespace
231  string word;
232
233  string s("   aaa b!@#$@#$cccc");
234  StringPiece input(s);
235
236  CHECK(RE2::Consume(&input, r, &word));
237  CHECK_EQ(word, "aaa") << " input: " << input;
238  CHECK(RE2::Consume(&input, r, &word));
239  CHECK_EQ(word, "b") << " input: " << input;
240  CHECK(! RE2::Consume(&input, r, &word)) << " input: " << input;
241}
242
243TEST(RE2, ConsumeN) {
244  const string s(" one two three 4");
245  StringPiece input(s);
246
247  RE2::Arg argv[2];
248  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
249
250  // 0 arg
251  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0));  // Skips "one".
252
253  // 1 arg
254  string word;
255  argv[0] = &word;
256  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1));
257  EXPECT_EQ("two", word);
258
259  // Multi-args
260  int n;
261  argv[1] = &n;
262  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2));
263  EXPECT_EQ("three", word);
264  EXPECT_EQ(4, n);
265}
266
267TEST(RE2, FindAndConsume) {
268  VLOG(1) << "TestFindAndConsume";
269
270  RE2 r("(\\w+)");      // matches a word
271  string word;
272
273  string s("   aaa b!@#$@#$cccc");
274  StringPiece input(s);
275
276  CHECK(RE2::FindAndConsume(&input, r, &word));
277  CHECK_EQ(word, "aaa");
278  CHECK(RE2::FindAndConsume(&input, r, &word));
279  CHECK_EQ(word, "b");
280  CHECK(RE2::FindAndConsume(&input, r, &word));
281  CHECK_EQ(word, "cccc");
282  CHECK(! RE2::FindAndConsume(&input, r, &word));
283
284  // Check that FindAndConsume works without any submatches.
285  // Earlier version used uninitialized data for
286  // length to consume.
287  input = "aaa";
288  CHECK(RE2::FindAndConsume(&input, "aaa"));
289  CHECK_EQ(input, "");
290}
291
292TEST(RE2, FindAndConsumeN) {
293  const string s(" one two three 4");
294  StringPiece input(s);
295
296  RE2::Arg argv[2];
297  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
298
299  // 0 arg
300  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0));  // Skips "one".
301
302  // 1 arg
303  string word;
304  argv[0] = &word;
305  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1));
306  EXPECT_EQ("two", word);
307
308  // Multi-args
309  int n;
310  argv[1] = &n;
311  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2));
312  EXPECT_EQ("three", word);
313  EXPECT_EQ(4, n);
314}
315
316TEST(RE2, MatchNumberPeculiarity) {
317  VLOG(1) << "TestMatchNumberPeculiarity";
318
319  RE2 r("(foo)|(bar)|(baz)");
320  string word1;
321  string word2;
322  string word3;
323
324  CHECK(RE2::PartialMatch("foo", r, &word1, &word2, &word3));
325  CHECK_EQ(word1, "foo");
326  CHECK_EQ(word2, "");
327  CHECK_EQ(word3, "");
328  CHECK(RE2::PartialMatch("bar", r, &word1, &word2, &word3));
329  CHECK_EQ(word1, "");
330  CHECK_EQ(word2, "bar");
331  CHECK_EQ(word3, "");
332  CHECK(RE2::PartialMatch("baz", r, &word1, &word2, &word3));
333  CHECK_EQ(word1, "");
334  CHECK_EQ(word2, "");
335  CHECK_EQ(word3, "baz");
336  CHECK(!RE2::PartialMatch("f", r, &word1, &word2, &word3));
337
338  string a;
339  CHECK(RE2::FullMatch("hello", "(foo)|hello", &a));
340  CHECK_EQ(a, "");
341}
342
343TEST(RE2, Match) {
344  RE2 re("((\\w+):([0-9]+))");   // extracts host and port
345  StringPiece group[4];
346
347  // No match.
348  StringPiece s = "zyzzyva";
349  CHECK(!re.Match(s, 0, s.size(), RE2::UNANCHORED,
350                  group, arraysize(group)));
351
352  // Matches and extracts.
353  s = "a chrisr:9000 here";
354  CHECK(re.Match(s, 0, s.size(), RE2::UNANCHORED,
355                 group, arraysize(group)));
356  CHECK_EQ(group[0], "chrisr:9000");
357  CHECK_EQ(group[1], "chrisr:9000");
358  CHECK_EQ(group[2], "chrisr");
359  CHECK_EQ(group[3], "9000");
360
361  string all, host;
362  int port;
363  CHECK(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port));
364  CHECK_EQ(all, "chrisr:9000");
365  CHECK_EQ(host, "chrisr");
366  CHECK_EQ(port, 9000);
367}
368
369static void TestRecursion(int size, const char *pattern) {
370  // Fill up a string repeating the pattern given
371  string domain;
372  domain.resize(size);
373  int patlen = strlen(pattern);
374  for (int i = 0; i < size; ++i) {
375    domain[i] = pattern[i % patlen];
376  }
377  // Just make sure it doesn't crash due to too much recursion.
378  RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
379  RE2::FullMatch(domain, re);
380}
381
382// A meta-quoted string, interpreted as a pattern, should always match
383// the original unquoted string.
384static void TestQuoteMeta(string unquoted,
385                          const RE2::Options& options = RE2::DefaultOptions) {
386  string quoted = RE2::QuoteMeta(unquoted);
387  RE2 re(quoted, options);
388  EXPECT_TRUE_M(RE2::FullMatch(unquoted, re),
389                "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
390}
391
392// A meta-quoted string, interpreted as a pattern, should always match
393// the original unquoted string.
394static void NegativeTestQuoteMeta(string unquoted, string should_not_match,
395                                  const RE2::Options& options = RE2::DefaultOptions) {
396  string quoted = RE2::QuoteMeta(unquoted);
397  RE2 re(quoted, options);
398  EXPECT_FALSE_M(RE2::FullMatch(should_not_match, re),
399                 "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
400}
401
402// Tests that quoted meta characters match their original strings,
403// and that a few things that shouldn't match indeed do not.
404TEST(QuoteMeta, Simple) {
405  TestQuoteMeta("foo");
406  TestQuoteMeta("foo.bar");
407  TestQuoteMeta("foo\\.bar");
408  TestQuoteMeta("[1-9]");
409  TestQuoteMeta("1.5-2.0?");
410  TestQuoteMeta("\\d");
411  TestQuoteMeta("Who doesn't like ice cream?");
412  TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
413  TestQuoteMeta("((?!)xxx).*yyy");
414  TestQuoteMeta("([");
415}
416TEST(QuoteMeta, SimpleNegative) {
417  NegativeTestQuoteMeta("foo", "bar");
418  NegativeTestQuoteMeta("...", "bar");
419  NegativeTestQuoteMeta("\\.", ".");
420  NegativeTestQuoteMeta("\\.", "..");
421  NegativeTestQuoteMeta("(a)", "a");
422  NegativeTestQuoteMeta("(a|b)", "a");
423  NegativeTestQuoteMeta("(a|b)", "(a)");
424  NegativeTestQuoteMeta("(a|b)", "a|b");
425  NegativeTestQuoteMeta("[0-9]", "0");
426  NegativeTestQuoteMeta("[0-9]", "0-9");
427  NegativeTestQuoteMeta("[0-9]", "[9]");
428  NegativeTestQuoteMeta("((?!)xxx)", "xxx");
429}
430
431TEST(QuoteMeta, Latin1) {
432  TestQuoteMeta("3\xb2 = 9", RE2::Latin1);
433}
434
435TEST(QuoteMeta, UTF8) {
436  TestQuoteMeta("Plácido Domingo");
437  TestQuoteMeta("xyz");  // No fancy utf8.
438  TestQuoteMeta("\xc2\xb0");  // 2-byte utf8 -- a degree symbol.
439  TestQuoteMeta("27\xc2\xb0 degrees");  // As a middle character.
440  TestQuoteMeta("\xe2\x80\xb3");  // 3-byte utf8 -- a double prime.
441  TestQuoteMeta("\xf0\x9d\x85\x9f");  // 4-byte utf8 -- a music note.
442  TestQuoteMeta("27\xc2\xb0");  // Interpreted as Latin-1, this should
443                                // still work.
444  NegativeTestQuoteMeta("27\xc2\xb0",
445                        "27\\\xc2\\\xb0");  // 2-byte utf8 -- a degree symbol.
446}
447
448TEST(QuoteMeta, HasNull) {
449  string has_null;
450
451  // string with one null character
452  has_null += '\0';
453  TestQuoteMeta(has_null);
454  NegativeTestQuoteMeta(has_null, "");
455
456  // Don't want null-followed-by-'1' to be interpreted as '\01'.
457  has_null += '1';
458  TestQuoteMeta(has_null);
459  NegativeTestQuoteMeta(has_null, "\1");
460}
461
462TEST(ProgramSize, BigProgram) {
463  RE2 re_simple("simple regexp");
464  RE2 re_medium("medium.*regexp");
465  RE2 re_complex("hard.{1,128}regexp");
466
467  CHECK_GT(re_simple.ProgramSize(), 0);
468  CHECK_GT(re_medium.ProgramSize(), re_simple.ProgramSize());
469  CHECK_GT(re_complex.ProgramSize(), re_medium.ProgramSize());
470}
471
472// Issue 956519: handling empty character sets was
473// causing NULL dereference.  This tests a few empty character sets.
474// (The way to get an empty character set is to negate a full one.)
475TEST(EmptyCharset, Fuzz) {
476  static const char *empties[] = {
477    "[^\\S\\s]",
478    "[^\\S[:space:]]",
479    "[^\\D\\d]",
480    "[^\\D[:digit:]]"
481  };
482  for (int i = 0; i < arraysize(empties); i++)
483    CHECK(!RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
484}
485
486// Test that named groups work correctly.
487TEST(Capture, NamedGroups) {
488  {
489    RE2 re("(hello world)");
490    CHECK_EQ(re.NumberOfCapturingGroups(), 1);
491    const map<string, int>& m = re.NamedCapturingGroups();
492    CHECK_EQ(m.size(), 0);
493  }
494
495  {
496    RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
497    CHECK_EQ(re.NumberOfCapturingGroups(), 6);
498    const map<string, int>& m = re.NamedCapturingGroups();
499    CHECK_EQ(m.size(), 4);
500    CHECK_EQ(m.find("A")->second, 1);
501    CHECK_EQ(m.find("B")->second, 2);
502    CHECK_EQ(m.find("C")->second, 3);
503    CHECK_EQ(m.find("D")->second, 6);  // $4 and $5 are anonymous
504  }
505}
506
507TEST(RE2, FullMatchWithNoArgs) {
508  CHECK(RE2::FullMatch("h", "h"));
509  CHECK(RE2::FullMatch("hello", "hello"));
510  CHECK(RE2::FullMatch("hello", "h.*o"));
511  CHECK(!RE2::FullMatch("othello", "h.*o"));       // Must be anchored at front
512  CHECK(!RE2::FullMatch("hello!", "h.*o"));        // Must be anchored at end
513}
514
515TEST(RE2, PartialMatch) {
516  CHECK(RE2::PartialMatch("x", "x"));
517  CHECK(RE2::PartialMatch("hello", "h.*o"));
518  CHECK(RE2::PartialMatch("othello", "h.*o"));
519  CHECK(RE2::PartialMatch("hello!", "h.*o"));
520  CHECK(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))"));
521}
522
523TEST(RE2, PartialMatchN) {
524  RE2::Arg argv[2];
525  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
526
527  // 0 arg
528  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0));
529  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0));
530
531  // 1 arg
532  int i;
533  argv[0] = &i;
534  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", args, 1));
535  EXPECT_EQ(1001, i);
536  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", args, 1));
537
538  // Multi-arg
539  string s;
540  argv[1] = &s;
541  EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2));
542  EXPECT_EQ(42, i);
543  EXPECT_EQ("life", s);
544  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2));
545}
546
547TEST(RE2, FullMatchZeroArg) {
548  // Zero-arg
549  CHECK(RE2::FullMatch("1001", "\\d+"));
550}
551
552TEST(RE2, FullMatchOneArg) {
553  int i;
554
555  // Single-arg
556  CHECK(RE2::FullMatch("1001", "(\\d+)",   &i));
557  CHECK_EQ(i, 1001);
558  CHECK(RE2::FullMatch("-123", "(-?\\d+)", &i));
559  CHECK_EQ(i, -123);
560  CHECK(!RE2::FullMatch("10", "()\\d+", &i));
561  CHECK(!RE2::FullMatch("1234567890123456789012345678901234567890",
562                       "(\\d+)", &i));
563}
564
565TEST(RE2, FullMatchIntegerArg) {
566  int i;
567
568  // Digits surrounding integer-arg
569  CHECK(RE2::FullMatch("1234", "1(\\d*)4", &i));
570  CHECK_EQ(i, 23);
571  CHECK(RE2::FullMatch("1234", "(\\d)\\d+", &i));
572  CHECK_EQ(i, 1);
573  CHECK(RE2::FullMatch("-1234", "(-\\d)\\d+", &i));
574  CHECK_EQ(i, -1);
575  CHECK(RE2::PartialMatch("1234", "(\\d)", &i));
576  CHECK_EQ(i, 1);
577  CHECK(RE2::PartialMatch("-1234", "(-\\d)", &i));
578  CHECK_EQ(i, -1);
579}
580
581TEST(RE2, FullMatchStringArg) {
582  string s;
583  // String-arg
584  CHECK(RE2::FullMatch("hello", "h(.*)o", &s));
585  CHECK_EQ(s, string("ell"));
586}
587
588TEST(RE2, FullMatchStringPieceArg) {
589  int i;
590  // StringPiece-arg
591  StringPiece sp;
592  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i));
593  CHECK_EQ(sp.size(), 4);
594  CHECK(memcmp(sp.data(), "ruby", 4) == 0);
595  CHECK_EQ(i, 1234);
596}
597
598TEST(RE2, FullMatchMultiArg) {
599  int i;
600  string s;
601  // Multi-arg
602  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
603  CHECK_EQ(s, string("ruby"));
604  CHECK_EQ(i, 1234);
605}
606
607TEST(RE2, FullMatchN) {
608  RE2::Arg argv[2];
609  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
610
611  // 0 arg
612  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", args, 0));
613  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", args, 0));
614
615  // 1 arg
616  int i;
617  argv[0] = &i;
618  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", args, 1));
619  EXPECT_EQ(1001, i);
620  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", args, 1));
621
622  // Multi-arg
623  string s;
624  argv[1] = &s;
625  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2));
626  EXPECT_EQ(42, i);
627  EXPECT_EQ("life", s);
628  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2));
629}
630
631TEST(RE2, FullMatchIgnoredArg) {
632  int i;
633  string s;
634  // Ignored arg
635  CHECK(RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &s, (void*)NULL, &i));
636  CHECK_EQ(s, string("ruby"));
637  CHECK_EQ(i, 1234);
638}
639
640TEST(RE2, FullMatchTypedNullArg) {
641  string s;
642
643  // Ignore non-void* NULL arg
644  CHECK(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL));
645  CHECK(RE2::FullMatch("hello", "h(.*)o", (string*)NULL));
646  CHECK(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL));
647  CHECK(RE2::FullMatch("1234", "(.*)", (int*)NULL));
648  CHECK(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL));
649  CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL));
650  CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL));
651
652  // Fail on non-void* NULL arg if the match doesn't parse for the given type.
653  CHECK(!RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL));
654  CHECK(!RE2::FullMatch("hello", "(.*)", (int*)NULL));
655  CHECK(!RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL));
656  CHECK(!RE2::FullMatch("hello", "(.*)", (double*)NULL));
657  CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL));
658}
659
660// Check that numeric parsing code does not read past the end of
661// the number being parsed.
662TEST(RE2, NULTerminated) {
663  char *v;
664  int x;
665  long pagesize = sysconf(_SC_PAGE_SIZE);
666
667#ifndef MAP_ANONYMOUS
668#define MAP_ANONYMOUS MAP_ANON
669#endif
670  v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
671                              MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
672  CHECK(v != reinterpret_cast<char*>(-1));
673  LOG(INFO) << "Memory at " << (void*)v;
674  CHECK_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
675  v[pagesize - 1] = '1';
676
677  x = 0;
678  CHECK(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x));
679  CHECK_EQ(x, 1);
680}
681
682TEST(RE2, FullMatchTypeTests) {
683  // Type tests
684  string zeros(100, '0');
685  {
686    char c;
687    CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
688    CHECK_EQ(c, 'H');
689  }
690  {
691    unsigned char c;
692    CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
693    CHECK_EQ(c, static_cast<unsigned char>('H'));
694  }
695  {
696    int16 v;
697    CHECK(RE2::FullMatch("100",     "(-?\\d+)", &v));    CHECK_EQ(v, 100);
698    CHECK(RE2::FullMatch("-100",    "(-?\\d+)", &v));    CHECK_EQ(v, -100);
699    CHECK(RE2::FullMatch("32767",   "(-?\\d+)", &v));    CHECK_EQ(v, 32767);
700    CHECK(RE2::FullMatch("-32768",  "(-?\\d+)", &v));    CHECK_EQ(v, -32768);
701    CHECK(!RE2::FullMatch("-32769", "(-?\\d+)", &v));
702    CHECK(!RE2::FullMatch("32768",  "(-?\\d+)", &v));
703  }
704  {
705    uint16 v;
706    CHECK(RE2::FullMatch("100",     "(\\d+)", &v));    CHECK_EQ(v, 100);
707    CHECK(RE2::FullMatch("32767",   "(\\d+)", &v));    CHECK_EQ(v, 32767);
708    CHECK(RE2::FullMatch("65535",   "(\\d+)", &v));    CHECK_EQ(v, 65535);
709    CHECK(!RE2::FullMatch("65536",  "(\\d+)", &v));
710  }
711  {
712    int32 v;
713    static const int32 max = 0x7fffffff;
714    static const int32 min = -max - 1;
715    CHECK(RE2::FullMatch("100",          "(-?\\d+)", &v)); CHECK_EQ(v, 100);
716    CHECK(RE2::FullMatch("-100",         "(-?\\d+)", &v)); CHECK_EQ(v, -100);
717    CHECK(RE2::FullMatch("2147483647",   "(-?\\d+)", &v)); CHECK_EQ(v, max);
718    CHECK(RE2::FullMatch("-2147483648",  "(-?\\d+)", &v)); CHECK_EQ(v, min);
719    CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
720    CHECK(!RE2::FullMatch("2147483648",  "(-?\\d+)", &v));
721
722    CHECK(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
723    CHECK_EQ(v, max);
724    CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
725    CHECK_EQ(v, min);
726
727    CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
728    CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
729    CHECK_EQ(v, max);
730    CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
731  }
732  {
733    uint32 v;
734    static const uint32 max = 0xfffffffful;
735    CHECK(RE2::FullMatch("100",         "(\\d+)", &v)); CHECK_EQ(v, 100);
736    CHECK(RE2::FullMatch("4294967295",  "(\\d+)", &v)); CHECK_EQ(v, max);
737    CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v));
738    CHECK(!RE2::FullMatch("-1",         "(\\d+)", &v));
739
740    CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); CHECK_EQ(v, max);
741  }
742  {
743    int64 v;
744    static const int64 max = 0x7fffffffffffffffull;
745    static const int64 min = -max - 1;
746    char buf[32];
747
748    CHECK(RE2::FullMatch("100",  "(-?\\d+)", &v)); CHECK_EQ(v, 100);
749    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
750
751    snprintf(buf, sizeof(buf), "%lld", (long long int)max);
752    CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, max);
753
754    snprintf(buf, sizeof(buf), "%lld", (long long int)min);
755    CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, min);
756
757    snprintf(buf, sizeof(buf), "%lld", (long long int)max);
758    assert(buf[strlen(buf)-1] != '9');
759    buf[strlen(buf)-1]++;
760    CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
761
762    snprintf(buf, sizeof(buf), "%lld", (long long int)min);
763    assert(buf[strlen(buf)-1] != '9');
764    buf[strlen(buf)-1]++;
765    CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
766  }
767  {
768    uint64 v;
769    int64 v2;
770    static const uint64 max = 0xffffffffffffffffull;
771    char buf[32];
772
773    CHECK(RE2::FullMatch("100",  "(-?\\d+)", &v));  CHECK_EQ(v, 100);
774    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2)); CHECK_EQ(v2, -100);
775
776    snprintf(buf, sizeof(buf), "%llu", (long long unsigned)max);
777    CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, max);
778
779    assert(buf[strlen(buf)-1] != '9');
780    buf[strlen(buf)-1]++;
781    CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
782  }
783}
784
785TEST(RE2, FloatingPointFullMatchTypes) {
786  string zeros(100, '0');
787  {
788    float v;
789    CHECK(RE2::FullMatch("100",   "(.*)", &v));  CHECK_EQ(v, 100);
790    CHECK(RE2::FullMatch("-100.", "(.*)", &v));  CHECK_EQ(v, -100);
791    CHECK(RE2::FullMatch("1e23",  "(.*)", &v));  CHECK_EQ(v, float(1e23));
792
793    CHECK(RE2::FullMatch(zeros + "1e23",  "(.*)", &v));
794    CHECK_EQ(v, float(1e23));
795
796    // 6700000000081920.1 is an edge case.
797    // 6700000000081920 is exactly halfway between
798    // two float32s, so the .1 should make it round up.
799    // However, the .1 is outside the precision possible with
800    // a float64: the nearest float64 is 6700000000081920.
801    // So if the code uses strtod and then converts to float32,
802    // round-to-even will make it round down instead of up.
803    // To pass the test, the parser must call strtof directly.
804    // This test case is carefully chosen to use only a 17-digit
805    // number, since C does not guarantee to get the correctly
806    // rounded answer for strtod and strtof unless the input is
807    // short.
808    CHECK(RE2::FullMatch("0.1", "(.*)", &v));
809    CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f);
810    CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v));
811    CHECK_EQ(v, 6700000000081920.1f)
812      << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f);
813  }
814  {
815    double v;
816    CHECK(RE2::FullMatch("100",   "(.*)", &v));  CHECK_EQ(v, 100);
817    CHECK(RE2::FullMatch("-100.", "(.*)", &v));  CHECK_EQ(v, -100);
818    CHECK(RE2::FullMatch("1e23",  "(.*)", &v));  CHECK_EQ(v, 1e23);
819    CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
820    CHECK_EQ(v, double(1e23));
821
822    CHECK(RE2::FullMatch("0.1", "(.*)", &v));
823    CHECK_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1);
824    CHECK(RE2::FullMatch("1.00000005960464485", "(.*)", &v));
825    CHECK_EQ(v, 1.0000000596046448)
826      << StringPrintf("%.17g != %.17g", v, 1.0000000596046448);
827  }
828}
829
830TEST(RE2, FullMatchAnchored) {
831  int i;
832  // Check that matching is fully anchored
833  CHECK(!RE2::FullMatch("x1001", "(\\d+)",  &i));
834  CHECK(!RE2::FullMatch("1001x", "(\\d+)",  &i));
835  CHECK(RE2::FullMatch("x1001",  "x(\\d+)", &i)); CHECK_EQ(i, 1001);
836  CHECK(RE2::FullMatch("1001x",  "(\\d+)x", &i)); CHECK_EQ(i, 1001);
837}
838
839TEST(RE2, FullMatchBraces) {
840  // Braces
841  CHECK(RE2::FullMatch("0abcd",  "[0-9a-f+.-]{5,}"));
842  CHECK(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}"));
843  CHECK(!RE2::FullMatch("0abc",  "[0-9a-f+.-]{5,}"));
844}
845
846TEST(RE2, Complicated) {
847  // Complicated RE2
848  CHECK(RE2::FullMatch("foo", "foo|bar|[A-Z]"));
849  CHECK(RE2::FullMatch("bar", "foo|bar|[A-Z]"));
850  CHECK(RE2::FullMatch("X",   "foo|bar|[A-Z]"));
851  CHECK(!RE2::FullMatch("XY", "foo|bar|[A-Z]"));
852}
853
854TEST(RE2, FullMatchEnd) {
855  // Check full-match handling (needs '$' tacked on internally)
856  CHECK(RE2::FullMatch("fo", "fo|foo"));
857  CHECK(RE2::FullMatch("foo", "fo|foo"));
858  CHECK(RE2::FullMatch("fo", "fo|foo$"));
859  CHECK(RE2::FullMatch("foo", "fo|foo$"));
860  CHECK(RE2::FullMatch("foo", "foo$"));
861  CHECK(!RE2::FullMatch("foo$bar", "foo\\$"));
862  CHECK(!RE2::FullMatch("fox", "fo|bar"));
863
864  // Uncomment the following if we change the handling of '$' to
865  // prevent it from matching a trailing newline
866  if (false) {
867    // Check that we don't get bitten by pcre's special handling of a
868    // '\n' at the end of the string matching '$'
869    CHECK(!RE2::PartialMatch("foo\n", "foo$"));
870  }
871}
872
873TEST(RE2, FullMatchArgCount) {
874  // Number of args
875  int a[16];
876  CHECK(RE2::FullMatch("", ""));
877
878  memset(a, 0, sizeof(0));
879  CHECK(RE2::FullMatch("1",
880                      "(\\d){1}",
881                      &a[0]));
882  CHECK_EQ(a[0], 1);
883
884  memset(a, 0, sizeof(0));
885  CHECK(RE2::FullMatch("12",
886                      "(\\d)(\\d)",
887                      &a[0],  &a[1]));
888  CHECK_EQ(a[0], 1);
889  CHECK_EQ(a[1], 2);
890
891  memset(a, 0, sizeof(0));
892  CHECK(RE2::FullMatch("123",
893                      "(\\d)(\\d)(\\d)",
894                      &a[0],  &a[1],  &a[2]));
895  CHECK_EQ(a[0], 1);
896  CHECK_EQ(a[1], 2);
897  CHECK_EQ(a[2], 3);
898
899  memset(a, 0, sizeof(0));
900  CHECK(RE2::FullMatch("1234",
901                      "(\\d)(\\d)(\\d)(\\d)",
902                      &a[0],  &a[1],  &a[2],  &a[3]));
903  CHECK_EQ(a[0], 1);
904  CHECK_EQ(a[1], 2);
905  CHECK_EQ(a[2], 3);
906  CHECK_EQ(a[3], 4);
907
908  memset(a, 0, sizeof(0));
909  CHECK(RE2::FullMatch("12345",
910                      "(\\d)(\\d)(\\d)(\\d)(\\d)",
911                      &a[0],  &a[1],  &a[2],  &a[3],
912                      &a[4]));
913  CHECK_EQ(a[0], 1);
914  CHECK_EQ(a[1], 2);
915  CHECK_EQ(a[2], 3);
916  CHECK_EQ(a[3], 4);
917  CHECK_EQ(a[4], 5);
918
919  memset(a, 0, sizeof(0));
920  CHECK(RE2::FullMatch("123456",
921                      "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
922                      &a[0],  &a[1],  &a[2],  &a[3],
923                      &a[4],  &a[5]));
924  CHECK_EQ(a[0], 1);
925  CHECK_EQ(a[1], 2);
926  CHECK_EQ(a[2], 3);
927  CHECK_EQ(a[3], 4);
928  CHECK_EQ(a[4], 5);
929  CHECK_EQ(a[5], 6);
930
931  memset(a, 0, sizeof(0));
932  CHECK(RE2::FullMatch("1234567",
933                      "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
934                      &a[0],  &a[1],  &a[2],  &a[3],
935                      &a[4],  &a[5],  &a[6]));
936  CHECK_EQ(a[0], 1);
937  CHECK_EQ(a[1], 2);
938  CHECK_EQ(a[2], 3);
939  CHECK_EQ(a[3], 4);
940  CHECK_EQ(a[4], 5);
941  CHECK_EQ(a[5], 6);
942  CHECK_EQ(a[6], 7);
943
944  memset(a, 0, sizeof(0));
945  CHECK(RE2::FullMatch("1234567890123456",
946                      "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
947                      "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
948                      &a[0],  &a[1],  &a[2],  &a[3],
949                      &a[4],  &a[5],  &a[6],  &a[7],
950                      &a[8],  &a[9],  &a[10], &a[11],
951                      &a[12], &a[13], &a[14], &a[15]));
952  CHECK_EQ(a[0], 1);
953  CHECK_EQ(a[1], 2);
954  CHECK_EQ(a[2], 3);
955  CHECK_EQ(a[3], 4);
956  CHECK_EQ(a[4], 5);
957  CHECK_EQ(a[5], 6);
958  CHECK_EQ(a[6], 7);
959  CHECK_EQ(a[7], 8);
960  CHECK_EQ(a[8], 9);
961  CHECK_EQ(a[9], 0);
962  CHECK_EQ(a[10], 1);
963  CHECK_EQ(a[11], 2);
964  CHECK_EQ(a[12], 3);
965  CHECK_EQ(a[13], 4);
966  CHECK_EQ(a[14], 5);
967  CHECK_EQ(a[15], 6);
968}
969
970TEST(RE2, Accessors) {
971  // Check the pattern() accessor
972  {
973    const string kPattern = "http://([^/]+)/.*";
974    const RE2 re(kPattern);
975    CHECK_EQ(kPattern, re.pattern());
976  }
977
978  // Check RE2 error field.
979  {
980    RE2 re("foo");
981    CHECK(re.error().empty());  // Must have no error
982    CHECK(re.ok());
983    CHECK(re.error_code() == RE2::NoError);
984  }
985}
986
987TEST(RE2, UTF8) {
988  // Check UTF-8 handling
989  // Three Japanese characters (nihongo)
990  const char utf8_string[] = {
991       0xe6, 0x97, 0xa5, // 65e5
992       0xe6, 0x9c, 0xac, // 627c
993       0xe8, 0xaa, 0x9e, // 8a9e
994       0
995  };
996  const char utf8_pattern[] = {
997       '.',
998       0xe6, 0x9c, 0xac, // 627c
999       '.',
1000       0
1001  };
1002
1003  // Both should match in either mode, bytes or UTF-8
1004  RE2 re_test1(".........", RE2::Latin1);
1005  CHECK(RE2::FullMatch(utf8_string, re_test1));
1006  RE2 re_test2("...");
1007  CHECK(RE2::FullMatch(utf8_string, re_test2));
1008
1009  // Check that '.' matches one byte or UTF-8 character
1010  // according to the mode.
1011  string s;
1012  RE2 re_test3("(.)", RE2::Latin1);
1013  CHECK(RE2::PartialMatch(utf8_string, re_test3, &s));
1014  CHECK_EQ(s, string("\xe6"));
1015  RE2 re_test4("(.)");
1016  CHECK(RE2::PartialMatch(utf8_string, re_test4, &s));
1017  CHECK_EQ(s, string("\xe6\x97\xa5"));
1018
1019  // Check that string matches itself in either mode
1020  RE2 re_test5(utf8_string, RE2::Latin1);
1021  CHECK(RE2::FullMatch(utf8_string, re_test5));
1022  RE2 re_test6(utf8_string);
1023  CHECK(RE2::FullMatch(utf8_string, re_test6));
1024
1025  // Check that pattern matches string only in UTF8 mode
1026  RE2 re_test7(utf8_pattern, RE2::Latin1);
1027  CHECK(!RE2::FullMatch(utf8_string, re_test7));
1028  RE2 re_test8(utf8_pattern);
1029  CHECK(RE2::FullMatch(utf8_string, re_test8));
1030}
1031
1032TEST(RE2, UngreedyUTF8) {
1033  // Check that ungreedy, UTF8 regular expressions don't match when they
1034  // oughtn't -- see bug 82246.
1035  {
1036    // This code always worked.
1037    const char* pattern = "\\w+X";
1038    const string target = "a aX";
1039    RE2 match_sentence(pattern, RE2::Latin1);
1040    RE2 match_sentence_re(pattern);
1041
1042    CHECK(!RE2::FullMatch(target, match_sentence));
1043    CHECK(!RE2::FullMatch(target, match_sentence_re));
1044  }
1045  {
1046    const char* pattern = "(?U)\\w+X";
1047    const string target = "a aX";
1048    RE2 match_sentence(pattern, RE2::Latin1);
1049    CHECK_EQ(match_sentence.error(), "");
1050    RE2 match_sentence_re(pattern);
1051
1052    CHECK(!RE2::FullMatch(target, match_sentence));
1053    CHECK(!RE2::FullMatch(target, match_sentence_re));
1054  }
1055}
1056
1057TEST(RE2, Rejects) {
1058  { RE2 re("a\\1", RE2::Quiet); CHECK(!re.ok()); }
1059  {
1060    RE2 re("a[x", RE2::Quiet);
1061    CHECK(!re.ok());
1062  }
1063  {
1064    RE2 re("a[z-a]", RE2::Quiet);
1065    CHECK(!re.ok());
1066  }
1067  {
1068    RE2 re("a[[:foobar:]]", RE2::Quiet);
1069    CHECK(!re.ok());
1070  }
1071  {
1072    RE2 re("a(b", RE2::Quiet);
1073    CHECK(!re.ok());
1074  }
1075  {
1076    RE2 re("a\\", RE2::Quiet);
1077    CHECK(!re.ok());
1078  }
1079}
1080
1081TEST(RE2, NoCrash) {
1082  // Test that using a bad regexp doesn't crash.
1083  {
1084    RE2 re("a\\", RE2::Quiet);
1085    CHECK(!re.ok());
1086    CHECK(!RE2::PartialMatch("a\\b", re));
1087  }
1088
1089  // Test that using an enormous regexp doesn't crash
1090  {
1091    RE2 re("(((.{100}){100}){100}){100}", RE2::Quiet);
1092    CHECK(!re.ok());
1093    CHECK(!RE2::PartialMatch("aaa", re));
1094  }
1095
1096  // Test that a crazy regexp still compiles and runs.
1097  {
1098    RE2 re(".{512}x", RE2::Quiet);
1099    CHECK(re.ok());
1100    string s;
1101    s.append(515, 'c');
1102    s.append("x");
1103    CHECK(RE2::PartialMatch(s, re));
1104  }
1105}
1106
1107TEST(RE2, Recursion) {
1108  // Test that recursion is stopped.
1109  // This test is PCRE-legacy -- there's no recursion in RE2.
1110  int bytes = 15 * 1024;  // enough to crash PCRE
1111  TestRecursion(bytes, ".");
1112  TestRecursion(bytes, "a");
1113  TestRecursion(bytes, "a.");
1114  TestRecursion(bytes, "ab.");
1115  TestRecursion(bytes, "abc.");
1116}
1117
1118TEST(RE2, BigCountedRepetition) {
1119  // Test that counted repetition works, given tons of memory.
1120  RE2::Options opt;
1121  opt.set_max_mem(256<<20);
1122
1123  RE2 re(".{512}x", opt);
1124  CHECK(re.ok());
1125  string s;
1126  s.append(515, 'c');
1127  s.append("x");
1128  CHECK(RE2::PartialMatch(s, re));
1129}
1130
1131TEST(RE2, DeepRecursion) {
1132  // Test for deep stack recursion.  This would fail with a
1133  // segmentation violation due to stack overflow before pcre was
1134  // patched.
1135  // Again, a PCRE legacy test.  RE2 doesn't recurse.
1136  string comment("x*");
1137  string a(131072, 'a');
1138  comment += a;
1139  comment += "*x";
1140  RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
1141  CHECK(RE2::FullMatch(comment, re));
1142}
1143
1144// Suggested by Josh Hyman.  Failed when SearchOnePass was
1145// not implementing case-folding.
1146TEST(CaseInsensitive, MatchAndConsume) {
1147  string result;
1148  string text = "A fish named *Wanda*";
1149  StringPiece sp(text);
1150
1151  EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result));
1152  EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
1153}
1154
1155// RE2 should permit implicit conversions from string, StringPiece, const char*,
1156// and C string literals.
1157TEST(RE2, ImplicitConversions) {
1158  string re_string(".");
1159  StringPiece re_stringpiece(".");
1160  const char* re_cstring = ".";
1161  EXPECT_TRUE(RE2::PartialMatch("e", re_string));
1162  EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece));
1163  EXPECT_TRUE(RE2::PartialMatch("e", re_cstring));
1164  EXPECT_TRUE(RE2::PartialMatch("e", "."));
1165}
1166
1167// Bugs introduced by 8622304
1168TEST(RE2, CL8622304) {
1169  // reported by ingow
1170  string dir;
1171  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])"));  // ok
1172  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir));  // fails
1173
1174  // reported by jacobsa
1175  string key, val;
1176  EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
1177              "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
1178              &key,
1179              &val));
1180  EXPECT_EQ(key, "bar");
1181  EXPECT_EQ(val, "1,0x2F,030,4,5");
1182}
1183
1184
1185// Check that RE2 returns correct regexp pieces on error.
1186// In particular, make sure it returns whole runes
1187// and that it always reports invalid UTF-8.
1188// Also check that Perl error flag piece is big enough.
1189static struct ErrorTest {
1190  const char *regexp;
1191  const char *error;
1192} error_tests[] = {
1193  { "ab\\αcd", "\\α" },
1194  { "ef\\x☺01", "\\x☺0" },
1195  { "gh\\x1☺01", "\\x1☺" },
1196  { "ij\\x1", "\\x1" },
1197  { "kl\\x", "\\x" },
1198  { "uv\\x{0000☺}", "\\x{0000☺" },
1199  { "wx\\p{ABC", "\\p{ABC" },
1200  { "yz(?smiUX:abc)", "(?smiUX" },   // used to return (?s but the error is X
1201  { "aa(?sm☺i", "(?sm☺" },
1202  { "bb[abc", "[abc" },
1203
1204  { "mn\\x1\377", "" },  // no argument string returned for invalid UTF-8
1205  { "op\377qr", "" },
1206  { "st\\x{00000\377", "" },
1207  { "zz\\p{\377}", "" },
1208  { "zz\\x{00\377}", "" },
1209  { "zz(?P<name\377>abc)", "" },
1210};
1211TEST(RE2, ErrorArgs) {
1212  for (int i = 0; i < arraysize(error_tests); i++) {
1213    RE2 re(error_tests[i].regexp, RE2::Quiet);
1214    EXPECT_FALSE(re.ok());
1215    EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error();
1216  }
1217}
1218
1219// Check that "never match \n" mode never matches \n.
1220static struct NeverTest {
1221  const char* regexp;
1222  const char* text;
1223  const char* match;
1224} never_tests[] = {
1225  { "(.*)", "abc\ndef\nghi\n", "abc" },
1226  { "(?s)(abc.*def)", "abc\ndef\n", NULL },
1227  { "(abc(.|\n)*def)", "abc\ndef\n", NULL },
1228  { "(abc[^x]*def)", "abc\ndef\n", NULL },
1229  { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
1230};
1231TEST(RE2, NeverNewline) {
1232  RE2::Options opt;
1233  opt.set_never_nl(true);
1234  for (int i = 0; i < arraysize(never_tests); i++) {
1235    const NeverTest& t = never_tests[i];
1236    RE2 re(t.regexp, opt);
1237    if (t.match == NULL) {
1238      EXPECT_FALSE(re.PartialMatch(t.text, re));
1239    } else {
1240      StringPiece m;
1241      EXPECT_TRUE(re.PartialMatch(t.text, re, &m));
1242      EXPECT_EQ(m, t.match);
1243    }
1244  }
1245}
1246
1247// Check that there are no capturing groups in "never capture" mode.
1248TEST(RE2, NeverCapture) {
1249  RE2::Options opt;
1250  opt.set_never_capture(true);
1251  RE2 re("(r)(e)", opt);
1252  EXPECT_EQ(0, re.NumberOfCapturingGroups());
1253}
1254
1255// Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
1256// Triggered by a failed DFA search falling back to Bitstate when
1257// using Match with a NULL submatch set.  Bitstate tried to read
1258// the submatch[0] entry even if nsubmatch was 0.
1259TEST(RE2, BitstateCaptureBug) {
1260  RE2::Options opt;
1261  opt.set_max_mem(20000);
1262  RE2 re("(_________$)", opt);
1263  StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
1264  EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
1265}
1266
1267// C++ version of bug 609710.
1268TEST(RE2, UnicodeClasses) {
1269  const string str = "ABCDEFGHI譚永鋒";
1270  string a, b, c;
1271
1272  EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
1273  EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
1274  EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
1275  EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
1276  EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
1277  EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
1278
1279  EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
1280  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
1281  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
1282  EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
1283  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
1284  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
1285
1286  EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
1287  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
1288  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
1289  EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
1290  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
1291  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
1292
1293  EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
1294  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
1295  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
1296  EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
1297  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
1298  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
1299
1300  EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));
1301  EXPECT_EQ("A", a);
1302  EXPECT_EQ("B", b);
1303  EXPECT_EQ("C", c);
1304
1305  EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));
1306  EXPECT_EQ("A", a);
1307  EXPECT_EQ("B", b);
1308  EXPECT_EQ("C", c);
1309
1310  EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));
1311
1312  EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
1313  EXPECT_EQ("A", a);
1314  EXPECT_EQ("B", b);
1315  EXPECT_EQ("C", c);
1316
1317  EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));
1318
1319  EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));
1320  EXPECT_EQ("譚", a);
1321  EXPECT_EQ("永", b);
1322  EXPECT_EQ("鋒", c);
1323}
1324
1325// Bug reported by saito. 2009/02/17
1326TEST(RE2, NullVsEmptyString) {
1327  RE2 re2(".*");
1328  StringPiece v1("");
1329  EXPECT_TRUE(RE2::FullMatch(v1, re2));
1330
1331  StringPiece v2;
1332  EXPECT_TRUE(RE2::FullMatch(v2, re2));
1333}
1334
1335// Issue 1816809
1336TEST(RE2, Bug1816809) {
1337  RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
1338  StringPiece piece("llx-3;llx4");
1339  string x;
1340  EXPECT_TRUE(RE2::Consume(&piece, re, &x));
1341}
1342
1343// Issue 3061120
1344TEST(RE2, Bug3061120) {
1345  RE2 re("(?i)\\W");
1346  EXPECT_FALSE(RE2::PartialMatch("x", re));  // always worked
1347  EXPECT_FALSE(RE2::PartialMatch("k", re));  // broke because of kelvin
1348  EXPECT_FALSE(RE2::PartialMatch("s", re));  // broke because of latin long s
1349}
1350
1351TEST(RE2, CapturingGroupNames) {
1352  // Opening parentheses annotated with group IDs:
1353  //      12    3        45   6         7
1354  RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
1355  EXPECT_TRUE(re.ok());
1356  const map<int, string>& have = re.CapturingGroupNames();
1357  map<int, string> want;
1358  want[3] = "G2";
1359  want[6] = "G2";
1360  want[7] = "G1";
1361  EXPECT_EQ(want, have);
1362}
1363
1364TEST(RE2, RegexpToStringLossOfAnchor) {
1365  EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at");
1366  EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at");
1367  EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$");
1368  EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)");
1369}
1370
1371}  // namespace re2
1372