1// -*- coding: utf-8 -*-
2// Copyright 2002-2009 The RE2 Authors.  All Rights Reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6// TODO: Test extractions for PartialMatch/Consume
7
8#include <sys/types.h>
9#ifndef WIN32
10#include <sys/mman.h>
11#endif
12#include <sys/stat.h>
13#include <errno.h>
14#include <vector>
15#include "util/test.h"
16#include "re2/re2.h"
17#include "re2/regexp.h"
18
19#ifdef WIN32
20#include <stdio.h>
21#define snprintf _snprintf
22#endif
23
24DECLARE_bool(logtostderr);
25
26namespace re2 {
27
28TEST(RE2, HexTests) {
29
30  VLOG(1) << "hex tests";
31
32#define CHECK_HEX(type, value) \
33  do { \
34    type v; \
35    CHECK(RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \
36    CHECK_EQ(v, 0x ## value); \
37    CHECK(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
38    CHECK_EQ(v, 0x ## value); \
39  } while(0)
40
41  CHECK_HEX(short,              2bad);
42  CHECK_HEX(unsigned short,     2badU);
43  CHECK_HEX(int,                dead);
44  CHECK_HEX(unsigned int,       deadU);
45  CHECK_HEX(long,               7eadbeefL);
46  CHECK_HEX(unsigned long,      deadbeefUL);
47  CHECK_HEX(long long,          12345678deadbeefLL);
48  CHECK_HEX(unsigned long long, cafebabedeadbeefULL);
49
50#undef CHECK_HEX
51}
52
53TEST(RE2, OctalTests) {
54  VLOG(1) << "octal tests";
55
56#define CHECK_OCTAL(type, value) \
57  do { \
58    type v; \
59    CHECK(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \
60    CHECK_EQ(v, 0 ## value); \
61    CHECK(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
62    CHECK_EQ(v, 0 ## value); \
63  } while(0)
64
65  CHECK_OCTAL(short,              77777);
66  CHECK_OCTAL(unsigned short,     177777U);
67  CHECK_OCTAL(int,                17777777777);
68  CHECK_OCTAL(unsigned int,       37777777777U);
69  CHECK_OCTAL(long,               17777777777L);
70  CHECK_OCTAL(unsigned long,      37777777777UL);
71  CHECK_OCTAL(long long,          777777777777777777777LL);
72  CHECK_OCTAL(unsigned long long, 1777777777777777777777ULL);
73
74#undef CHECK_OCTAL
75}
76
77TEST(RE2, DecimalTests) {
78  VLOG(1) << "decimal tests";
79
80#define CHECK_DECIMAL(type, value) \
81  do { \
82    type v; \
83    CHECK(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \
84    CHECK_EQ(v, value); \
85    CHECK(RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
86    CHECK_EQ(v, value); \
87  } while(0)
88
89  CHECK_DECIMAL(short,              -1);
90  CHECK_DECIMAL(unsigned short,     9999);
91  CHECK_DECIMAL(int,                -1000);
92  CHECK_DECIMAL(unsigned int,       12345U);
93  CHECK_DECIMAL(long,               -10000000L);
94  CHECK_DECIMAL(unsigned long,      3083324652U);
95  CHECK_DECIMAL(long long,          -100000000000000LL);
96  CHECK_DECIMAL(unsigned long long, 1234567890987654321ULL);
97
98#undef CHECK_DECIMAL
99}
100
101TEST(RE2, Replace) {
102  VLOG(1) << "TestReplace";
103
104  struct ReplaceTest {
105    const char *regexp;
106    const char *rewrite;
107    const char *original;
108    const char *single;
109    const char *global;
110    int        greplace_count;
111  };
112  static const ReplaceTest tests[] = {
113    { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
114      "\\2\\1ay",
115      "the quick brown fox jumps over the lazy dogs.",
116      "ethay quick brown fox jumps over the lazy dogs.",
117      "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
118      9 },
119    { "\\w+",
120      "\\0-NOSPAM",
121      "abcd.efghi@google.com",
122      "abcd-NOSPAM.efghi@google.com",
123      "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM",
124      4 },
125    { "^",
126      "(START)",
127      "foo",
128      "(START)foo",
129      "(START)foo",
130      1 },
131    { "^",
132      "(START)",
133      "",
134      "(START)",
135      "(START)",
136      1 },
137    { "$",
138      "(END)",
139      "",
140      "(END)",
141      "(END)",
142      1 },
143    { "b",
144      "bb",
145      "ababababab",
146      "abbabababab",
147      "abbabbabbabbabb",
148      5 },
149    { "b",
150      "bb",
151      "bbbbbb",
152      "bbbbbbb",
153      "bbbbbbbbbbbb",
154      6 },
155    { "b+",
156      "bb",
157      "bbbbbb",
158      "bb",
159      "bb",
160      1 },
161    { "b*",
162      "bb",
163      "bbbbbb",
164      "bb",
165      "bb",
166      1 },
167    { "b*",
168      "bb",
169      "aaaaa",
170      "bbaaaaa",
171      "bbabbabbabbabbabb",
172      6 },
173    // Check newline handling
174    { "a.*a",
175      "(\\0)",
176      "aba\naba",
177      "(aba)\naba",
178      "(aba)\n(aba)",
179      2 },
180    { "", NULL, NULL, NULL, NULL, 0 }
181  };
182
183  for (const ReplaceTest *t = tests; t->original != NULL; ++t) {
184    VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite);
185    string one(t->original);
186    CHECK(RE2::Replace(&one, t->regexp, t->rewrite));
187    CHECK_EQ(one, t->single);
188    string all(t->original);
189    CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
190      << "Got: " << all;
191    CHECK_EQ(all, t->global);
192  }
193}
194
195static void TestCheckRewriteString(const char* regexp, const char* rewrite,
196                              bool expect_ok) {
197  string error;
198  RE2 exp(regexp);
199  bool actual_ok = exp.CheckRewriteString(rewrite, &error);
200  EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
201}
202
203TEST(CheckRewriteString, all) {
204  TestCheckRewriteString("abc", "foo", true);
205  TestCheckRewriteString("abc", "foo\\", false);
206  TestCheckRewriteString("abc", "foo\\0bar", true);
207
208  TestCheckRewriteString("a(b)c", "foo", true);
209  TestCheckRewriteString("a(b)c", "foo\\0bar", true);
210  TestCheckRewriteString("a(b)c", "foo\\1bar", true);
211  TestCheckRewriteString("a(b)c", "foo\\2bar", false);
212  TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true);
213
214  TestCheckRewriteString("a(b)(c)", "foo\\12", true);
215  TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true);
216  TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false);
217}
218
219TEST(RE2, Extract) {
220  VLOG(1) << "TestExtract";
221
222  string s;
223
224  CHECK(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s));
225  CHECK_EQ(s, "kremvax!boris");
226
227  CHECK(RE2::Extract("foo", ".*", "'\\0'", &s));
228  CHECK_EQ(s, "'foo'");
229  // check that false match doesn't overwrite
230  CHECK(!RE2::Extract("baz", "bar", "'\\0'", &s));
231  CHECK_EQ(s, "'foo'");
232}
233
234TEST(RE2, Consume) {
235  VLOG(1) << "TestConsume";
236
237  RE2 r("\\s*(\\w+)");    // matches a word, possibly proceeded by whitespace
238  string word;
239
240  string s("   aaa b!@#$@#$cccc");
241  StringPiece input(s);
242
243  CHECK(RE2::Consume(&input, r, &word));
244  CHECK_EQ(word, "aaa") << " input: " << input;
245  CHECK(RE2::Consume(&input, r, &word));
246  CHECK_EQ(word, "b") << " input: " << input;
247  CHECK(! RE2::Consume(&input, r, &word)) << " input: " << input;
248}
249
250TEST(RE2, ConsumeN) {
251  const string s(" one two three 4");
252  StringPiece input(s);
253
254  RE2::Arg argv[2];
255  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
256
257  // 0 arg
258  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0));  // Skips "one".
259
260  // 1 arg
261  string word;
262  argv[0] = &word;
263  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1));
264  EXPECT_EQ("two", word);
265
266  // Multi-args
267  int n;
268  argv[1] = &n;
269  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2));
270  EXPECT_EQ("three", word);
271  EXPECT_EQ(4, n);
272}
273
274TEST(RE2, FindAndConsume) {
275  VLOG(1) << "TestFindAndConsume";
276
277  RE2 r("(\\w+)");      // matches a word
278  string word;
279
280  string s("   aaa b!@#$@#$cccc");
281  StringPiece input(s);
282
283  CHECK(RE2::FindAndConsume(&input, r, &word));
284  CHECK_EQ(word, "aaa");
285  CHECK(RE2::FindAndConsume(&input, r, &word));
286  CHECK_EQ(word, "b");
287  CHECK(RE2::FindAndConsume(&input, r, &word));
288  CHECK_EQ(word, "cccc");
289  CHECK(! RE2::FindAndConsume(&input, r, &word));
290
291  // Check that FindAndConsume works without any submatches.
292  // Earlier version used uninitialized data for
293  // length to consume.
294  input = "aaa";
295  CHECK(RE2::FindAndConsume(&input, "aaa"));
296  CHECK_EQ(input, "");
297}
298
299TEST(RE2, FindAndConsumeN) {
300  const string s(" one two three 4");
301  StringPiece input(s);
302
303  RE2::Arg argv[2];
304  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
305
306  // 0 arg
307  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0));  // Skips "one".
308
309  // 1 arg
310  string word;
311  argv[0] = &word;
312  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1));
313  EXPECT_EQ("two", word);
314
315  // Multi-args
316  int n;
317  argv[1] = &n;
318  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2));
319  EXPECT_EQ("three", word);
320  EXPECT_EQ(4, n);
321}
322
323TEST(RE2, MatchNumberPeculiarity) {
324  VLOG(1) << "TestMatchNumberPeculiarity";
325
326  RE2 r("(foo)|(bar)|(baz)");
327  string word1;
328  string word2;
329  string word3;
330
331  CHECK(RE2::PartialMatch("foo", r, &word1, &word2, &word3));
332  CHECK_EQ(word1, "foo");
333  CHECK_EQ(word2, "");
334  CHECK_EQ(word3, "");
335  CHECK(RE2::PartialMatch("bar", r, &word1, &word2, &word3));
336  CHECK_EQ(word1, "");
337  CHECK_EQ(word2, "bar");
338  CHECK_EQ(word3, "");
339  CHECK(RE2::PartialMatch("baz", r, &word1, &word2, &word3));
340  CHECK_EQ(word1, "");
341  CHECK_EQ(word2, "");
342  CHECK_EQ(word3, "baz");
343  CHECK(!RE2::PartialMatch("f", r, &word1, &word2, &word3));
344
345  string a;
346  CHECK(RE2::FullMatch("hello", "(foo)|hello", &a));
347  CHECK_EQ(a, "");
348}
349
350TEST(RE2, Match) {
351  RE2 re("((\\w+):([0-9]+))");   // extracts host and port
352  StringPiece group[4];
353
354  // No match.
355  StringPiece s = "zyzzyva";
356  CHECK(!re.Match(s, 0, s.size(), RE2::UNANCHORED,
357                  group, arraysize(group)));
358
359  // Matches and extracts.
360  s = "a chrisr:9000 here";
361  CHECK(re.Match(s, 0, s.size(), RE2::UNANCHORED,
362                 group, arraysize(group)));
363  CHECK_EQ(group[0], "chrisr:9000");
364  CHECK_EQ(group[1], "chrisr:9000");
365  CHECK_EQ(group[2], "chrisr");
366  CHECK_EQ(group[3], "9000");
367
368  string all, host;
369  int port;
370  CHECK(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port));
371  CHECK_EQ(all, "chrisr:9000");
372  CHECK_EQ(host, "chrisr");
373  CHECK_EQ(port, 9000);
374}
375
376static void TestRecursion(int size, const char *pattern) {
377  // Fill up a string repeating the pattern given
378  string domain;
379  domain.resize(size);
380  int patlen = strlen(pattern);
381  for (int i = 0; i < size; ++i) {
382    domain[i] = pattern[i % patlen];
383  }
384  // Just make sure it doesn't crash due to too much recursion.
385  RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
386  RE2::FullMatch(domain, re);
387}
388
389// A meta-quoted string, interpreted as a pattern, should always match
390// the original unquoted string.
391static void TestQuoteMeta(string unquoted,
392                          const RE2::Options& options = RE2::DefaultOptions) {
393  string quoted = RE2::QuoteMeta(unquoted);
394  RE2 re(quoted, options);
395  EXPECT_TRUE_M(RE2::FullMatch(unquoted, re),
396                "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
397}
398
399// A meta-quoted string, interpreted as a pattern, should always match
400// the original unquoted string.
401static void NegativeTestQuoteMeta(string unquoted, string should_not_match,
402                                  const RE2::Options& options = RE2::DefaultOptions) {
403  string quoted = RE2::QuoteMeta(unquoted);
404  RE2 re(quoted, options);
405  EXPECT_FALSE_M(RE2::FullMatch(should_not_match, re),
406                 "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
407}
408
409// Tests that quoted meta characters match their original strings,
410// and that a few things that shouldn't match indeed do not.
411TEST(QuoteMeta, Simple) {
412  TestQuoteMeta("foo");
413  TestQuoteMeta("foo.bar");
414  TestQuoteMeta("foo\\.bar");
415  TestQuoteMeta("[1-9]");
416  TestQuoteMeta("1.5-2.0?");
417  TestQuoteMeta("\\d");
418  TestQuoteMeta("Who doesn't like ice cream?");
419  TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
420  TestQuoteMeta("((?!)xxx).*yyy");
421  TestQuoteMeta("([");
422}
423TEST(QuoteMeta, SimpleNegative) {
424  NegativeTestQuoteMeta("foo", "bar");
425  NegativeTestQuoteMeta("...", "bar");
426  NegativeTestQuoteMeta("\\.", ".");
427  NegativeTestQuoteMeta("\\.", "..");
428  NegativeTestQuoteMeta("(a)", "a");
429  NegativeTestQuoteMeta("(a|b)", "a");
430  NegativeTestQuoteMeta("(a|b)", "(a)");
431  NegativeTestQuoteMeta("(a|b)", "a|b");
432  NegativeTestQuoteMeta("[0-9]", "0");
433  NegativeTestQuoteMeta("[0-9]", "0-9");
434  NegativeTestQuoteMeta("[0-9]", "[9]");
435  NegativeTestQuoteMeta("((?!)xxx)", "xxx");
436}
437
438TEST(QuoteMeta, Latin1) {
439  TestQuoteMeta("3\xb2 = 9", RE2::Latin1);
440}
441
442TEST(QuoteMeta, UTF8) {
443  TestQuoteMeta("Plácido Domingo");
444  TestQuoteMeta("xyz");  // No fancy utf8.
445  TestQuoteMeta("\xc2\xb0");  // 2-byte utf8 -- a degree symbol.
446  TestQuoteMeta("27\xc2\xb0 degrees");  // As a middle character.
447  TestQuoteMeta("\xe2\x80\xb3");  // 3-byte utf8 -- a double prime.
448  TestQuoteMeta("\xf0\x9d\x85\x9f");  // 4-byte utf8 -- a music note.
449  TestQuoteMeta("27\xc2\xb0");  // Interpreted as Latin-1, this should
450                                // still work.
451  NegativeTestQuoteMeta("27\xc2\xb0",
452                        "27\\\xc2\\\xb0");  // 2-byte utf8 -- a degree symbol.
453}
454
455TEST(QuoteMeta, HasNull) {
456  string has_null;
457
458  // string with one null character
459  has_null += '\0';
460  TestQuoteMeta(has_null);
461  NegativeTestQuoteMeta(has_null, "");
462
463  // Don't want null-followed-by-'1' to be interpreted as '\01'.
464  has_null += '1';
465  TestQuoteMeta(has_null);
466  NegativeTestQuoteMeta(has_null, "\1");
467}
468
469TEST(ProgramSize, BigProgram) {
470  RE2 re_simple("simple regexp");
471  RE2 re_medium("medium.*regexp");
472  RE2 re_complex("hard.{1,128}regexp");
473
474  CHECK_GT(re_simple.ProgramSize(), 0);
475  CHECK_GT(re_medium.ProgramSize(), re_simple.ProgramSize());
476  CHECK_GT(re_complex.ProgramSize(), re_medium.ProgramSize());
477}
478
479// Issue 956519: handling empty character sets was
480// causing NULL dereference.  This tests a few empty character sets.
481// (The way to get an empty character set is to negate a full one.)
482TEST(EmptyCharset, Fuzz) {
483  static const char *empties[] = {
484    "[^\\S\\s]",
485    "[^\\S[:space:]]",
486    "[^\\D\\d]",
487    "[^\\D[:digit:]]"
488  };
489  for (int i = 0; i < arraysize(empties); i++)
490    CHECK(!RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
491}
492
493// Test that named groups work correctly.
494TEST(Capture, NamedGroups) {
495  {
496    RE2 re("(hello world)");
497    CHECK_EQ(re.NumberOfCapturingGroups(), 1);
498    const map<string, int>& m = re.NamedCapturingGroups();
499    CHECK_EQ(m.size(), 0);
500  }
501
502  {
503    RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
504    CHECK_EQ(re.NumberOfCapturingGroups(), 6);
505    const map<string, int>& m = re.NamedCapturingGroups();
506    CHECK_EQ(m.size(), 4);
507    CHECK_EQ(m.find("A")->second, 1);
508    CHECK_EQ(m.find("B")->second, 2);
509    CHECK_EQ(m.find("C")->second, 3);
510    CHECK_EQ(m.find("D")->second, 6);  // $4 and $5 are anonymous
511  }
512}
513
514TEST(RE2, FullMatchWithNoArgs) {
515  CHECK(RE2::FullMatch("h", "h"));
516  CHECK(RE2::FullMatch("hello", "hello"));
517  CHECK(RE2::FullMatch("hello", "h.*o"));
518  CHECK(!RE2::FullMatch("othello", "h.*o"));       // Must be anchored at front
519  CHECK(!RE2::FullMatch("hello!", "h.*o"));        // Must be anchored at end
520}
521
522TEST(RE2, PartialMatch) {
523  CHECK(RE2::PartialMatch("x", "x"));
524  CHECK(RE2::PartialMatch("hello", "h.*o"));
525  CHECK(RE2::PartialMatch("othello", "h.*o"));
526  CHECK(RE2::PartialMatch("hello!", "h.*o"));
527  CHECK(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))"));
528}
529
530TEST(RE2, PartialMatchN) {
531  RE2::Arg argv[2];
532  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
533
534  // 0 arg
535  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0));
536  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0));
537
538  // 1 arg
539  int i;
540  argv[0] = &i;
541  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", args, 1));
542  EXPECT_EQ(1001, i);
543  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", args, 1));
544
545  // Multi-arg
546  string s;
547  argv[1] = &s;
548  EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2));
549  EXPECT_EQ(42, i);
550  EXPECT_EQ("life", s);
551  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2));
552}
553
554TEST(RE2, FullMatchZeroArg) {
555  // Zero-arg
556  CHECK(RE2::FullMatch("1001", "\\d+"));
557}
558
559TEST(RE2, FullMatchOneArg) {
560  int i;
561
562  // Single-arg
563  CHECK(RE2::FullMatch("1001", "(\\d+)",   &i));
564  CHECK_EQ(i, 1001);
565  CHECK(RE2::FullMatch("-123", "(-?\\d+)", &i));
566  CHECK_EQ(i, -123);
567  CHECK(!RE2::FullMatch("10", "()\\d+", &i));
568  CHECK(!RE2::FullMatch("1234567890123456789012345678901234567890",
569                       "(\\d+)", &i));
570}
571
572TEST(RE2, FullMatchIntegerArg) {
573  int i;
574
575  // Digits surrounding integer-arg
576  CHECK(RE2::FullMatch("1234", "1(\\d*)4", &i));
577  CHECK_EQ(i, 23);
578  CHECK(RE2::FullMatch("1234", "(\\d)\\d+", &i));
579  CHECK_EQ(i, 1);
580  CHECK(RE2::FullMatch("-1234", "(-\\d)\\d+", &i));
581  CHECK_EQ(i, -1);
582  CHECK(RE2::PartialMatch("1234", "(\\d)", &i));
583  CHECK_EQ(i, 1);
584  CHECK(RE2::PartialMatch("-1234", "(-\\d)", &i));
585  CHECK_EQ(i, -1);
586}
587
588TEST(RE2, FullMatchStringArg) {
589  string s;
590  // String-arg
591  CHECK(RE2::FullMatch("hello", "h(.*)o", &s));
592  CHECK_EQ(s, string("ell"));
593}
594
595TEST(RE2, FullMatchStringPieceArg) {
596  int i;
597  // StringPiece-arg
598  StringPiece sp;
599  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i));
600  CHECK_EQ(sp.size(), 4);
601  CHECK(memcmp(sp.data(), "ruby", 4) == 0);
602  CHECK_EQ(i, 1234);
603}
604
605TEST(RE2, FullMatchMultiArg) {
606  int i;
607  string s;
608  // Multi-arg
609  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
610  CHECK_EQ(s, string("ruby"));
611  CHECK_EQ(i, 1234);
612}
613
614TEST(RE2, FullMatchN) {
615  RE2::Arg argv[2];
616  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
617
618  // 0 arg
619  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", args, 0));
620  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", args, 0));
621
622  // 1 arg
623  int i;
624  argv[0] = &i;
625  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", args, 1));
626  EXPECT_EQ(1001, i);
627  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", args, 1));
628
629  // Multi-arg
630  string s;
631  argv[1] = &s;
632  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2));
633  EXPECT_EQ(42, i);
634  EXPECT_EQ("life", s);
635  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2));
636}
637
638TEST(RE2, FullMatchIgnoredArg) {
639  int i;
640  string s;
641  // Ignored arg
642  CHECK(RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &s, (void*)NULL, &i));
643  CHECK_EQ(s, string("ruby"));
644  CHECK_EQ(i, 1234);
645}
646
647TEST(RE2, FullMatchTypedNullArg) {
648  string s;
649
650  // Ignore non-void* NULL arg
651  CHECK(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL));
652  CHECK(RE2::FullMatch("hello", "h(.*)o", (string*)NULL));
653  CHECK(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL));
654  CHECK(RE2::FullMatch("1234", "(.*)", (int*)NULL));
655  CHECK(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL));
656  CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL));
657  CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL));
658
659  // Fail on non-void* NULL arg if the match doesn't parse for the given type.
660  CHECK(!RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL));
661  CHECK(!RE2::FullMatch("hello", "(.*)", (int*)NULL));
662  CHECK(!RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL));
663  CHECK(!RE2::FullMatch("hello", "(.*)", (double*)NULL));
664  CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL));
665}
666
667#ifndef WIN32
668// Check that numeric parsing code does not read past the end of
669// the number being parsed.
670TEST(RE2, NULTerminated) {
671  char *v;
672  int x;
673  long pagesize = sysconf(_SC_PAGE_SIZE);
674
675#ifndef MAP_ANONYMOUS
676#define MAP_ANONYMOUS MAP_ANON
677#endif
678  v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
679                              MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
680  CHECK(v != reinterpret_cast<char*>(-1));
681  LOG(INFO) << "Memory at " << (void*)v;
682  CHECK_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
683  v[pagesize - 1] = '1';
684
685  x = 0;
686  CHECK(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x));
687  CHECK_EQ(x, 1);
688}
689#endif
690
691TEST(RE2, FullMatchTypeTests) {
692  // Type tests
693  string zeros(100, '0');
694  {
695    char c;
696    CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
697    CHECK_EQ(c, 'H');
698  }
699  {
700    unsigned char c;
701    CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
702    CHECK_EQ(c, static_cast<unsigned char>('H'));
703  }
704  {
705    int16 v;
706    CHECK(RE2::FullMatch("100",     "(-?\\d+)", &v));    CHECK_EQ(v, 100);
707    CHECK(RE2::FullMatch("-100",    "(-?\\d+)", &v));    CHECK_EQ(v, -100);
708    CHECK(RE2::FullMatch("32767",   "(-?\\d+)", &v));    CHECK_EQ(v, 32767);
709    CHECK(RE2::FullMatch("-32768",  "(-?\\d+)", &v));    CHECK_EQ(v, -32768);
710    CHECK(!RE2::FullMatch("-32769", "(-?\\d+)", &v));
711    CHECK(!RE2::FullMatch("32768",  "(-?\\d+)", &v));
712  }
713  {
714    uint16 v;
715    CHECK(RE2::FullMatch("100",     "(\\d+)", &v));    CHECK_EQ(v, 100);
716    CHECK(RE2::FullMatch("32767",   "(\\d+)", &v));    CHECK_EQ(v, 32767);
717    CHECK(RE2::FullMatch("65535",   "(\\d+)", &v));    CHECK_EQ(v, 65535);
718    CHECK(!RE2::FullMatch("65536",  "(\\d+)", &v));
719  }
720  {
721    int32 v;
722    static const int32 max = 0x7fffffff;
723    static const int32 min = -max - 1;
724    CHECK(RE2::FullMatch("100",          "(-?\\d+)", &v)); CHECK_EQ(v, 100);
725    CHECK(RE2::FullMatch("-100",         "(-?\\d+)", &v)); CHECK_EQ(v, -100);
726    CHECK(RE2::FullMatch("2147483647",   "(-?\\d+)", &v)); CHECK_EQ(v, max);
727    CHECK(RE2::FullMatch("-2147483648",  "(-?\\d+)", &v)); CHECK_EQ(v, min);
728    CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
729    CHECK(!RE2::FullMatch("2147483648",  "(-?\\d+)", &v));
730
731    CHECK(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
732    CHECK_EQ(v, max);
733    CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
734    CHECK_EQ(v, min);
735
736    CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
737    CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
738    CHECK_EQ(v, max);
739    CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
740  }
741  {
742    uint32 v;
743    static const uint32 max = 0xfffffffful;
744    CHECK(RE2::FullMatch("100",         "(\\d+)", &v)); CHECK_EQ(v, 100);
745    CHECK(RE2::FullMatch("4294967295",  "(\\d+)", &v)); CHECK_EQ(v, max);
746    CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v));
747    CHECK(!RE2::FullMatch("-1",         "(\\d+)", &v));
748
749    CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); CHECK_EQ(v, max);
750  }
751  {
752    int64 v;
753    static const int64 max = 0x7fffffffffffffffull;
754    static const int64 min = -max - 1;
755    char buf[32];
756
757    CHECK(RE2::FullMatch("100",  "(-?\\d+)", &v)); CHECK_EQ(v, 100);
758    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
759
760    snprintf(buf, sizeof(buf), "%lld", (long long int)max);
761    CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, max);
762
763    snprintf(buf, sizeof(buf), "%lld", (long long int)min);
764    CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, min);
765
766    snprintf(buf, sizeof(buf), "%lld", (long long int)max);
767    assert(buf[strlen(buf)-1] != '9');
768    buf[strlen(buf)-1]++;
769    CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
770
771    snprintf(buf, sizeof(buf), "%lld", (long long int)min);
772    assert(buf[strlen(buf)-1] != '9');
773    buf[strlen(buf)-1]++;
774    CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
775  }
776  {
777    uint64 v;
778    int64 v2;
779    static const uint64 max = 0xffffffffffffffffull;
780    char buf[32];
781
782    CHECK(RE2::FullMatch("100",  "(-?\\d+)", &v));  CHECK_EQ(v, 100);
783    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2)); CHECK_EQ(v2, -100);
784
785    snprintf(buf, sizeof(buf), "%llu", (long long unsigned)max);
786    CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, max);
787
788    assert(buf[strlen(buf)-1] != '9');
789    buf[strlen(buf)-1]++;
790    CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
791  }
792}
793
794TEST(RE2, FloatingPointFullMatchTypes) {
795  string zeros(100, '0');
796  {
797    float v;
798    CHECK(RE2::FullMatch("100",   "(.*)", &v));  CHECK_EQ(v, 100);
799    CHECK(RE2::FullMatch("-100.", "(.*)", &v));  CHECK_EQ(v, -100);
800    CHECK(RE2::FullMatch("1e23",  "(.*)", &v));  CHECK_EQ(v, float(1e23));
801
802    CHECK(RE2::FullMatch(zeros + "1e23",  "(.*)", &v));
803    CHECK_EQ(v, float(1e23));
804
805    // 6700000000081920.1 is an edge case.
806    // 6700000000081920 is exactly halfway between
807    // two float32s, so the .1 should make it round up.
808    // However, the .1 is outside the precision possible with
809    // a float64: the nearest float64 is 6700000000081920.
810    // So if the code uses strtod and then converts to float32,
811    // round-to-even will make it round down instead of up.
812    // To pass the test, the parser must call strtof directly.
813    // This test case is carefully chosen to use only a 17-digit
814    // number, since C does not guarantee to get the correctly
815    // rounded answer for strtod and strtof unless the input is
816    // short.
817    CHECK(RE2::FullMatch("0.1", "(.*)", &v));
818    CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f);
819    CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v));
820    CHECK_EQ(v, 6700000000081920.1f)
821      << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f);
822  }
823  {
824    double v;
825    CHECK(RE2::FullMatch("100",   "(.*)", &v));  CHECK_EQ(v, 100);
826    CHECK(RE2::FullMatch("-100.", "(.*)", &v));  CHECK_EQ(v, -100);
827    CHECK(RE2::FullMatch("1e23",  "(.*)", &v));  CHECK_EQ(v, 1e23);
828    CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v));
829    CHECK_EQ(v, double(1e23));
830
831    CHECK(RE2::FullMatch("0.1", "(.*)", &v));
832    CHECK_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1);
833    CHECK(RE2::FullMatch("1.00000005960464485", "(.*)", &v));
834    CHECK_EQ(v, 1.0000000596046448)
835      << StringPrintf("%.17g != %.17g", v, 1.0000000596046448);
836  }
837}
838
839TEST(RE2, FullMatchAnchored) {
840  int i;
841  // Check that matching is fully anchored
842  CHECK(!RE2::FullMatch("x1001", "(\\d+)",  &i));
843  CHECK(!RE2::FullMatch("1001x", "(\\d+)",  &i));
844  CHECK(RE2::FullMatch("x1001",  "x(\\d+)", &i)); CHECK_EQ(i, 1001);
845  CHECK(RE2::FullMatch("1001x",  "(\\d+)x", &i)); CHECK_EQ(i, 1001);
846}
847
848TEST(RE2, FullMatchBraces) {
849  // Braces
850  CHECK(RE2::FullMatch("0abcd",  "[0-9a-f+.-]{5,}"));
851  CHECK(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}"));
852  CHECK(!RE2::FullMatch("0abc",  "[0-9a-f+.-]{5,}"));
853}
854
855TEST(RE2, Complicated) {
856  // Complicated RE2
857  CHECK(RE2::FullMatch("foo", "foo|bar|[A-Z]"));
858  CHECK(RE2::FullMatch("bar", "foo|bar|[A-Z]"));
859  CHECK(RE2::FullMatch("X",   "foo|bar|[A-Z]"));
860  CHECK(!RE2::FullMatch("XY", "foo|bar|[A-Z]"));
861}
862
863TEST(RE2, FullMatchEnd) {
864  // Check full-match handling (needs '$' tacked on internally)
865  CHECK(RE2::FullMatch("fo", "fo|foo"));
866  CHECK(RE2::FullMatch("foo", "fo|foo"));
867  CHECK(RE2::FullMatch("fo", "fo|foo$"));
868  CHECK(RE2::FullMatch("foo", "fo|foo$"));
869  CHECK(RE2::FullMatch("foo", "foo$"));
870  CHECK(!RE2::FullMatch("foo$bar", "foo\\$"));
871  CHECK(!RE2::FullMatch("fox", "fo|bar"));
872
873  // Uncomment the following if we change the handling of '$' to
874  // prevent it from matching a trailing newline
875  if (false) {
876    // Check that we don't get bitten by pcre's special handling of a
877    // '\n' at the end of the string matching '$'
878    CHECK(!RE2::PartialMatch("foo\n", "foo$"));
879  }
880}
881
882TEST(RE2, FullMatchArgCount) {
883  // Number of args
884  int a[16];
885  CHECK(RE2::FullMatch("", ""));
886
887  memset(a, 0, sizeof(0));
888  CHECK(RE2::FullMatch("1",
889                      "(\\d){1}",
890                      &a[0]));
891  CHECK_EQ(a[0], 1);
892
893  memset(a, 0, sizeof(0));
894  CHECK(RE2::FullMatch("12",
895                      "(\\d)(\\d)",
896                      &a[0],  &a[1]));
897  CHECK_EQ(a[0], 1);
898  CHECK_EQ(a[1], 2);
899
900  memset(a, 0, sizeof(0));
901  CHECK(RE2::FullMatch("123",
902                      "(\\d)(\\d)(\\d)",
903                      &a[0],  &a[1],  &a[2]));
904  CHECK_EQ(a[0], 1);
905  CHECK_EQ(a[1], 2);
906  CHECK_EQ(a[2], 3);
907
908  memset(a, 0, sizeof(0));
909  CHECK(RE2::FullMatch("1234",
910                      "(\\d)(\\d)(\\d)(\\d)",
911                      &a[0],  &a[1],  &a[2],  &a[3]));
912  CHECK_EQ(a[0], 1);
913  CHECK_EQ(a[1], 2);
914  CHECK_EQ(a[2], 3);
915  CHECK_EQ(a[3], 4);
916
917  memset(a, 0, sizeof(0));
918  CHECK(RE2::FullMatch("12345",
919                      "(\\d)(\\d)(\\d)(\\d)(\\d)",
920                      &a[0],  &a[1],  &a[2],  &a[3],
921                      &a[4]));
922  CHECK_EQ(a[0], 1);
923  CHECK_EQ(a[1], 2);
924  CHECK_EQ(a[2], 3);
925  CHECK_EQ(a[3], 4);
926  CHECK_EQ(a[4], 5);
927
928  memset(a, 0, sizeof(0));
929  CHECK(RE2::FullMatch("123456",
930                      "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
931                      &a[0],  &a[1],  &a[2],  &a[3],
932                      &a[4],  &a[5]));
933  CHECK_EQ(a[0], 1);
934  CHECK_EQ(a[1], 2);
935  CHECK_EQ(a[2], 3);
936  CHECK_EQ(a[3], 4);
937  CHECK_EQ(a[4], 5);
938  CHECK_EQ(a[5], 6);
939
940  memset(a, 0, sizeof(0));
941  CHECK(RE2::FullMatch("1234567",
942                      "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
943                      &a[0],  &a[1],  &a[2],  &a[3],
944                      &a[4],  &a[5],  &a[6]));
945  CHECK_EQ(a[0], 1);
946  CHECK_EQ(a[1], 2);
947  CHECK_EQ(a[2], 3);
948  CHECK_EQ(a[3], 4);
949  CHECK_EQ(a[4], 5);
950  CHECK_EQ(a[5], 6);
951  CHECK_EQ(a[6], 7);
952
953  memset(a, 0, sizeof(0));
954  CHECK(RE2::FullMatch("1234567890123456",
955                      "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
956                      "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
957                      &a[0],  &a[1],  &a[2],  &a[3],
958                      &a[4],  &a[5],  &a[6],  &a[7],
959                      &a[8],  &a[9],  &a[10], &a[11],
960                      &a[12], &a[13], &a[14], &a[15]));
961  CHECK_EQ(a[0], 1);
962  CHECK_EQ(a[1], 2);
963  CHECK_EQ(a[2], 3);
964  CHECK_EQ(a[3], 4);
965  CHECK_EQ(a[4], 5);
966  CHECK_EQ(a[5], 6);
967  CHECK_EQ(a[6], 7);
968  CHECK_EQ(a[7], 8);
969  CHECK_EQ(a[8], 9);
970  CHECK_EQ(a[9], 0);
971  CHECK_EQ(a[10], 1);
972  CHECK_EQ(a[11], 2);
973  CHECK_EQ(a[12], 3);
974  CHECK_EQ(a[13], 4);
975  CHECK_EQ(a[14], 5);
976  CHECK_EQ(a[15], 6);
977}
978
979TEST(RE2, Accessors) {
980  // Check the pattern() accessor
981  {
982    const string kPattern = "http://([^/]+)/.*";
983    const RE2 re(kPattern);
984    CHECK_EQ(kPattern, re.pattern());
985  }
986
987  // Check RE2 error field.
988  {
989    RE2 re("foo");
990    CHECK(re.error().empty());  // Must have no error
991    CHECK(re.ok());
992    CHECK(re.error_code() == RE2::NoError);
993  }
994}
995
996TEST(RE2, UTF8) {
997  // Check UTF-8 handling
998  // Three Japanese characters (nihongo)
999  const char utf8_string[] = {
1000       0xe6, 0x97, 0xa5, // 65e5
1001       0xe6, 0x9c, 0xac, // 627c
1002       0xe8, 0xaa, 0x9e, // 8a9e
1003       0
1004  };
1005  const char utf8_pattern[] = {
1006       '.',
1007       0xe6, 0x9c, 0xac, // 627c
1008       '.',
1009       0
1010  };
1011
1012  // Both should match in either mode, bytes or UTF-8
1013  RE2 re_test1(".........", RE2::Latin1);
1014  CHECK(RE2::FullMatch(utf8_string, re_test1));
1015  RE2 re_test2("...");
1016  CHECK(RE2::FullMatch(utf8_string, re_test2));
1017
1018  // Check that '.' matches one byte or UTF-8 character
1019  // according to the mode.
1020  string s;
1021  RE2 re_test3("(.)", RE2::Latin1);
1022  CHECK(RE2::PartialMatch(utf8_string, re_test3, &s));
1023  CHECK_EQ(s, string("\xe6"));
1024  RE2 re_test4("(.)");
1025  CHECK(RE2::PartialMatch(utf8_string, re_test4, &s));
1026  CHECK_EQ(s, string("\xe6\x97\xa5"));
1027
1028  // Check that string matches itself in either mode
1029  RE2 re_test5(utf8_string, RE2::Latin1);
1030  CHECK(RE2::FullMatch(utf8_string, re_test5));
1031  RE2 re_test6(utf8_string);
1032  CHECK(RE2::FullMatch(utf8_string, re_test6));
1033
1034  // Check that pattern matches string only in UTF8 mode
1035  RE2 re_test7(utf8_pattern, RE2::Latin1);
1036  CHECK(!RE2::FullMatch(utf8_string, re_test7));
1037  RE2 re_test8(utf8_pattern);
1038  CHECK(RE2::FullMatch(utf8_string, re_test8));
1039}
1040
1041TEST(RE2, UngreedyUTF8) {
1042  // Check that ungreedy, UTF8 regular expressions don't match when they
1043  // oughtn't -- see bug 82246.
1044  {
1045    // This code always worked.
1046    const char* pattern = "\\w+X";
1047    const string target = "a aX";
1048    RE2 match_sentence(pattern, RE2::Latin1);
1049    RE2 match_sentence_re(pattern);
1050
1051    CHECK(!RE2::FullMatch(target, match_sentence));
1052    CHECK(!RE2::FullMatch(target, match_sentence_re));
1053  }
1054  {
1055    const char* pattern = "(?U)\\w+X";
1056    const string target = "a aX";
1057    RE2 match_sentence(pattern, RE2::Latin1);
1058    CHECK_EQ(match_sentence.error(), "");
1059    RE2 match_sentence_re(pattern);
1060
1061    CHECK(!RE2::FullMatch(target, match_sentence));
1062    CHECK(!RE2::FullMatch(target, match_sentence_re));
1063  }
1064}
1065
1066TEST(RE2, Rejects) {
1067  { RE2 re("a\\1", RE2::Quiet); CHECK(!re.ok()); }
1068  {
1069    RE2 re("a[x", RE2::Quiet);
1070    CHECK(!re.ok());
1071  }
1072  {
1073    RE2 re("a[z-a]", RE2::Quiet);
1074    CHECK(!re.ok());
1075  }
1076  {
1077    RE2 re("a[[:foobar:]]", RE2::Quiet);
1078    CHECK(!re.ok());
1079  }
1080  {
1081    RE2 re("a(b", RE2::Quiet);
1082    CHECK(!re.ok());
1083  }
1084  {
1085    RE2 re("a\\", RE2::Quiet);
1086    CHECK(!re.ok());
1087  }
1088}
1089
1090TEST(RE2, NoCrash) {
1091  // Test that using a bad regexp doesn't crash.
1092  {
1093    RE2 re("a\\", RE2::Quiet);
1094    CHECK(!re.ok());
1095    CHECK(!RE2::PartialMatch("a\\b", re));
1096  }
1097
1098  // Test that using an enormous regexp doesn't crash
1099  {
1100    RE2 re("(((.{100}){100}){100}){100}", RE2::Quiet);
1101    CHECK(!re.ok());
1102    CHECK(!RE2::PartialMatch("aaa", re));
1103  }
1104
1105  // Test that a crazy regexp still compiles and runs.
1106  {
1107    RE2 re(".{512}x", RE2::Quiet);
1108    CHECK(re.ok());
1109    string s;
1110    s.append(515, 'c');
1111    s.append("x");
1112    CHECK(RE2::PartialMatch(s, re));
1113  }
1114}
1115
1116TEST(RE2, Recursion) {
1117  // Test that recursion is stopped.
1118  // This test is PCRE-legacy -- there's no recursion in RE2.
1119  int bytes = 15 * 1024;  // enough to crash PCRE
1120  TestRecursion(bytes, ".");
1121  TestRecursion(bytes, "a");
1122  TestRecursion(bytes, "a.");
1123  TestRecursion(bytes, "ab.");
1124  TestRecursion(bytes, "abc.");
1125}
1126
1127TEST(RE2, BigCountedRepetition) {
1128  // Test that counted repetition works, given tons of memory.
1129  RE2::Options opt;
1130  opt.set_max_mem(256<<20);
1131
1132  RE2 re(".{512}x", opt);
1133  CHECK(re.ok());
1134  string s;
1135  s.append(515, 'c');
1136  s.append("x");
1137  CHECK(RE2::PartialMatch(s, re));
1138}
1139
1140TEST(RE2, DeepRecursion) {
1141  // Test for deep stack recursion.  This would fail with a
1142  // segmentation violation due to stack overflow before pcre was
1143  // patched.
1144  // Again, a PCRE legacy test.  RE2 doesn't recurse.
1145  string comment("x*");
1146  string a(131072, 'a');
1147  comment += a;
1148  comment += "*x";
1149  RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
1150  CHECK(RE2::FullMatch(comment, re));
1151}
1152
1153// Suggested by Josh Hyman.  Failed when SearchOnePass was
1154// not implementing case-folding.
1155TEST(CaseInsensitive, MatchAndConsume) {
1156  string result;
1157  string text = "A fish named *Wanda*";
1158  StringPiece sp(text);
1159
1160  EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result));
1161  EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
1162}
1163
1164// RE2 should permit implicit conversions from string, StringPiece, const char*,
1165// and C string literals.
1166TEST(RE2, ImplicitConversions) {
1167  string re_string(".");
1168  StringPiece re_stringpiece(".");
1169  const char* re_cstring = ".";
1170  EXPECT_TRUE(RE2::PartialMatch("e", re_string));
1171  EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece));
1172  EXPECT_TRUE(RE2::PartialMatch("e", re_cstring));
1173  EXPECT_TRUE(RE2::PartialMatch("e", "."));
1174}
1175
1176// Bugs introduced by 8622304
1177TEST(RE2, CL8622304) {
1178  // reported by ingow
1179  string dir;
1180  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])"));  // ok
1181  EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir));  // fails
1182
1183  // reported by jacobsa
1184  string key, val;
1185  EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
1186              "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
1187              &key,
1188              &val));
1189  EXPECT_EQ(key, "bar");
1190  EXPECT_EQ(val, "1,0x2F,030,4,5");
1191}
1192
1193
1194// Check that RE2 returns correct regexp pieces on error.
1195// In particular, make sure it returns whole runes
1196// and that it always reports invalid UTF-8.
1197// Also check that Perl error flag piece is big enough.
1198static struct ErrorTest {
1199  const char *regexp;
1200  const char *error;
1201} error_tests[] = {
1202  { "ab\\αcd", "\\α" },
1203  { "ef\\x☺01", "\\x☺0" },
1204  { "gh\\x1☺01", "\\x1☺" },
1205  { "ij\\x1", "\\x1" },
1206  { "kl\\x", "\\x" },
1207  { "uv\\x{0000☺}", "\\x{0000☺" },
1208  { "wx\\p{ABC", "\\p{ABC" },
1209  { "yz(?smiUX:abc)", "(?smiUX" },   // used to return (?s but the error is X
1210  { "aa(?sm☺i", "(?sm☺" },
1211  { "bb[abc", "[abc" },
1212
1213  { "mn\\x1\377", "" },  // no argument string returned for invalid UTF-8
1214  { "op\377qr", "" },
1215  { "st\\x{00000\377", "" },
1216  { "zz\\p{\377}", "" },
1217  { "zz\\x{00\377}", "" },
1218  { "zz(?P<name\377>abc)", "" },
1219};
1220TEST(RE2, ErrorArgs) {
1221  for (int i = 0; i < arraysize(error_tests); i++) {
1222    RE2 re(error_tests[i].regexp, RE2::Quiet);
1223    EXPECT_FALSE(re.ok());
1224    EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error();
1225  }
1226}
1227
1228// Check that "never match \n" mode never matches \n.
1229static struct NeverTest {
1230  const char* regexp;
1231  const char* text;
1232  const char* match;
1233} never_tests[] = {
1234  { "(.*)", "abc\ndef\nghi\n", "abc" },
1235  { "(?s)(abc.*def)", "abc\ndef\n", NULL },
1236  { "(abc(.|\n)*def)", "abc\ndef\n", NULL },
1237  { "(abc[^x]*def)", "abc\ndef\n", NULL },
1238  { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
1239};
1240TEST(RE2, NeverNewline) {
1241  RE2::Options opt;
1242  opt.set_never_nl(true);
1243  for (int i = 0; i < arraysize(never_tests); i++) {
1244    const NeverTest& t = never_tests[i];
1245    RE2 re(t.regexp, opt);
1246    if (t.match == NULL) {
1247      EXPECT_FALSE(re.PartialMatch(t.text, re));
1248    } else {
1249      StringPiece m;
1250      EXPECT_TRUE(re.PartialMatch(t.text, re, &m));
1251      EXPECT_EQ(m, t.match);
1252    }
1253  }
1254}
1255
1256// Check that there are no capturing groups in "never capture" mode.
1257TEST(RE2, NeverCapture) {
1258  RE2::Options opt;
1259  opt.set_never_capture(true);
1260  RE2 re("(r)(e)", opt);
1261  EXPECT_EQ(0, re.NumberOfCapturingGroups());
1262}
1263
1264// Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
1265// Triggered by a failed DFA search falling back to Bitstate when
1266// using Match with a NULL submatch set.  Bitstate tried to read
1267// the submatch[0] entry even if nsubmatch was 0.
1268TEST(RE2, BitstateCaptureBug) {
1269  RE2::Options opt;
1270  opt.set_max_mem(20000);
1271  RE2 re("(_________$)", opt);
1272  StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
1273  EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
1274}
1275
1276// C++ version of bug 609710.
1277TEST(RE2, UnicodeClasses) {
1278  const string str = "ABCDEFGHI譚永鋒";
1279  string a, b, c;
1280
1281  EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
1282  EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
1283  EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
1284  EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
1285  EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
1286  EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
1287
1288  EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
1289  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
1290  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
1291  EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
1292  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
1293  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
1294
1295  EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
1296  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
1297  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
1298  EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
1299  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
1300  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
1301
1302  EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
1303  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
1304  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
1305  EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
1306  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
1307  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
1308
1309  EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));
1310  EXPECT_EQ("A", a);
1311  EXPECT_EQ("B", b);
1312  EXPECT_EQ("C", c);
1313
1314  EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));
1315  EXPECT_EQ("A", a);
1316  EXPECT_EQ("B", b);
1317  EXPECT_EQ("C", c);
1318
1319  EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));
1320
1321  EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
1322  EXPECT_EQ("A", a);
1323  EXPECT_EQ("B", b);
1324  EXPECT_EQ("C", c);
1325
1326  EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));
1327
1328  EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));
1329  EXPECT_EQ("譚", a);
1330  EXPECT_EQ("永", b);
1331  EXPECT_EQ("鋒", c);
1332}
1333
1334// Bug reported by saito. 2009/02/17
1335TEST(RE2, NullVsEmptyString) {
1336  RE2 re2(".*");
1337  StringPiece v1("");
1338  EXPECT_TRUE(RE2::FullMatch(v1, re2));
1339
1340  StringPiece v2;
1341  EXPECT_TRUE(RE2::FullMatch(v2, re2));
1342}
1343
1344// Issue 1816809
1345TEST(RE2, Bug1816809) {
1346  RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
1347  StringPiece piece("llx-3;llx4");
1348  string x;
1349  EXPECT_TRUE(RE2::Consume(&piece, re, &x));
1350}
1351
1352// Issue 3061120
1353TEST(RE2, Bug3061120) {
1354  RE2 re("(?i)\\W");
1355  EXPECT_FALSE(RE2::PartialMatch("x", re));  // always worked
1356  EXPECT_FALSE(RE2::PartialMatch("k", re));  // broke because of kelvin
1357  EXPECT_FALSE(RE2::PartialMatch("s", re));  // broke because of latin long s
1358}
1359
1360TEST(RE2, CapturingGroupNames) {
1361  // Opening parentheses annotated with group IDs:
1362  //      12    3        45   6         7
1363  RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
1364  EXPECT_TRUE(re.ok());
1365  const map<int, string>& have = re.CapturingGroupNames();
1366  map<int, string> want;
1367  want[3] = "G2";
1368  want[6] = "G2";
1369  want[7] = "G1";
1370  EXPECT_EQ(want, have);
1371}
1372
1373TEST(RE2, RegexpToStringLossOfAnchor) {
1374  EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at");
1375  EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at");
1376  EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$");
1377  EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)");
1378}
1379
1380}  // namespace re2
1381