exhaustive3_test.cc revision 0d4c52358a1af421705c54bd8a9fdd8a30558a2e
1// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Exhaustive testing of regular expression matching.
6
7#include "util/test.h"
8#include "re2/testing/exhaustive_tester.h"
9
10namespace re2 {
11
12// Test simple character classes by themselves.
13TEST(CharacterClasses, Exhaustive) {
14  vector<string> atoms = Split(" ",
15    "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
16  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
17                 5, Explode("ab"), "", "");
18}
19
20// Test simple character classes inside a___b (for example, a[a]b).
21TEST(CharacterClasses, ExhaustiveAB) {
22  vector<string> atoms = Split(" ",
23    "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
24  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
25                 5, Explode("ab"), "a%sb", "");
26}
27
28// Returns UTF8 for Rune r
29static string UTF8(Rune r) {
30  char buf[UTFmax+1];
31  buf[runetochar(buf, &r)] = 0;
32  return string(buf);
33}
34
35// Returns a vector of "interesting" UTF8 characters.
36// Unicode is now too big to just return all of them,
37// so UTF8Characters return a set likely to be good test cases.
38static const vector<string>& InterestingUTF8() {
39  static bool init;
40  static vector<string> v;
41
42  if (init)
43    return v;
44
45  init = true;
46  // All the Latin1 equivalents are interesting.
47  for (int i = 1; i < 256; i++)
48    v.push_back(UTF8(i));
49
50  // After that, the codes near bit boundaries are
51  // interesting, because they span byte sequence lengths.
52  for (int j = 0; j < 8; j++)
53    v.push_back(UTF8(256 + j));
54  for (int i = 512; i < Runemax; i <<= 1)
55    for (int j = -8; j < 8; j++)
56      v.push_back(UTF8(i + j));
57
58  // The codes near Runemax, including Runemax itself, are interesting.
59  for (int j = -8; j <= 0; j++)
60    v.push_back(UTF8(Runemax + j));
61
62  return v;
63}
64
65// Test interesting UTF-8 characters against character classes.
66TEST(InterestingUTF8, SingleOps) {
67  vector<string> atoms = Split(" ",
68    ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
69    "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
70    "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
71    "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
72  vector<string> ops;  // no ops
73  ExhaustiveTest(1, 0, atoms, ops,
74                 1, InterestingUTF8(), "", "");
75}
76
77// Test interesting UTF-8 characters against character classes,
78// but wrap everything inside AB.
79TEST(InterestingUTF8, AB) {
80  vector<string> atoms = Split(" ",
81    ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
82    "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
83    "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
84    "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
85  vector<string> ops;  // no ops
86  vector<string> alpha = InterestingUTF8();
87  for (int i = 0; i < alpha.size(); i++)
88    alpha[i] = "a" + alpha[i] + "b";
89  ExhaustiveTest(1, 0, atoms, ops,
90                 1, alpha, "a%sb", "");
91}
92
93}  // namespace re2
94
95