compile_test.cc revision 0d4c52358a1af421705c54bd8a9fdd8a30558a2e
1// Copyright 2007 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Test prog.cc, compile.cc
6
7#include <string>
8#include <vector>
9#include "util/test.h"
10#include "re2/regexp.h"
11#include "re2/prog.h"
12
// Declares a string flag named "show" with an empty default; its help text
// describes it as a regular expression to compile and dump.
// NOTE(review): nothing in the visible part of this file reads the flag --
// confirm whether it is still consumed elsewhere.
DEFINE_string(show, "", "regular expression to compile and dump");
14
15namespace re2 {
16
17// Simple input/output tests checking that
18// the regexp compiles to the expected code.
19// These are just to sanity check the basic implementation.
20// The real confidence tests happen by testing the NFA/DFA
21// that run the compiled code.
22
// One table-driven case: a regular expression paired with the program
// listing it is expected to compile to.
struct Test {
  const char* regexp;  // Pattern passed to Regexp::Parse by the Simple test.
  const char* code;    // Expected Prog::Dump() text; compared for exact equality.
};
27
// Regexps and the exact program listing each one must compile to.
// The Simple test compares `code` against Prog::Dump() with string
// equality, so every character matters, including each trailing "\n"
// and the order in which the instructions are listed.
// Byte ranges in the listings are hexadecimal (61 is 'a', 0a is '\n').
static Test tests[] = {
  // Single literal and concatenation.
  { "a",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },
  { "ab",
    "1. byte [61-61] -> 2\n"
    "2. byte [62-62] -> 3\n"
    "3. match! 0\n" },
  // Alternation of non-adjacent bytes keeps an explicit alt instruction.
  { "a|c",
    "3. alt -> 1 | 2\n"
    "1. byte [61-61] -> 4\n"
    "2. byte [63-63] -> 4\n"
    "4. match! 0\n" },
  // Alternation of adjacent bytes and the equivalent character class are
  // both expected to yield the single byte range [61-62].
  { "a|b",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },
  { "[ab]",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },
  // Repetition operators.  Each greedy form and its non-greedy twin differ
  // in the order of the alt's two targets (and, for the star and optional
  // forms, in the order the instructions appear in the listing).
  { "a+",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 1 | 3\n"
    "3. match! 0\n" },
  { "a+?",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 3 | 1\n"
    "3. match! 0\n" },
  { "a*",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 2\n"
    "3. match! 0\n" },
  { "a*?",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 2\n" },
  { "a?",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 3\n"
    "3. match! 0\n" },
  { "a??",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 3\n" },
  // Counted repetition: four byte instructions in sequence.
  { "a{4}",
    "1. byte [61-61] -> 2\n"
    "2. byte [61-61] -> 3\n"
    "3. byte [61-61] -> 4\n"
    "4. byte [61-61] -> 5\n"
    "5. match! 0\n" },
  // A capturing group adds "capture 2" / "capture 3" instructions around
  // the byte; the non-capturing group yields the same listing as plain "a".
  { "(a)",
    "2. capture 2 -> 1\n"
    "1. byte [61-61] -> 3\n"
    "3. capture 3 -> 4\n"
    "4. match! 0\n" },
  { "(?:a)",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },
  // Empty regexp: the listing is only the match instruction.
  { "",
    "2. match! 0\n" },
  // Dot and a negated class.  The Simple test parses with Regexp::Latin1,
  // and the expected ranges are single bytes that skip 0a.
  { ".",
    "3. alt -> 1 | 2\n"
    "1. byte [00-09] -> 4\n"
    "2. byte [0b-ff] -> 4\n"
    "4. match! 0\n" },
  { "[^ab]",
    "5. alt -> 3 | 4\n"
    "3. alt -> 1 | 2\n"
    "4. byte [63-ff] -> 6\n"
    "1. byte [00-09] -> 6\n"
    "2. byte [0b-60] -> 6\n"
    "6. match! 0\n" },
  // Upper/lower pair: expected as one case-insensitive "byte/i" instruction.
  { "[Aa]",
    "1. byte/i [61-61] -> 2\n"
    "2. match! 0\n" },
};
103
104TEST(TestRegexpCompileToProg, Simple) {
105  int failed = 0;
106  for (int i = 0; i < arraysize(tests); i++) {
107    const re2::Test& t = tests[i];
108    Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
109    if (re == NULL) {
110      LOG(ERROR) << "Cannot parse: " << t.regexp;
111      failed++;
112      continue;
113    }
114    Prog* prog = re->CompileToProg(0);
115    if (prog == NULL) {
116      LOG(ERROR) << "Cannot compile: " << t.regexp;
117      re->Decref();
118      failed++;
119      continue;
120    }
121    CHECK(re->CompileToProg(1) == NULL);
122    string s = prog->Dump();
123    if (s != t.code) {
124      LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
125      LOG(ERROR) << "Want:\n" << t.code;
126      LOG(ERROR) << "Got:\n" << s;
127      failed++;
128    }
129    delete prog;
130    re->Decref();
131  }
132  EXPECT_EQ(failed, 0);
133}
134
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
//
// The ByteRanges test uses each entry's index as the expected byte class
// for every byte in [lo, hi], so the entries are listed in ascending order
// and are contiguous: together they cover 0x00-0xFF with no gaps.  Any byte
// left out of the table would simply go unchecked.
static struct UTF8ByteRange {
  int lo;  // First byte of the range, inclusive.
  int hi;  // Last byte of the range, inclusive.
} utf8ranges[] = {
  { 0x00, 0x09 },
  { 0x0A, 0x0A },  // '\n', the byte that dot excludes.
  { 0x0B, 0x7F },  // Starts right after '\n' so 0x0B-0x0F are checked too.
  { 0x80, 0x8F },
  { 0x90, 0x9F },
  { 0xA0, 0xBF },
  { 0xC0, 0xC1 },
  { 0xC2, 0xDF },
  { 0xE0, 0xE0 },
  { 0xE1, 0xEF },
  { 0xF0, 0xF0 },
  { 0xF1, 0xF3 },
  { 0xF4, 0xF4 },
  { 0xF5, 0xFF },
};
157
158TEST(TestCompile, ByteRanges) {
159  Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
160  EXPECT_TRUE(re != NULL);
161  Prog* prog = re->CompileToProg(0);
162  EXPECT_TRUE(prog != NULL);
163  EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
164  for (int i = 0; i < arraysize(utf8ranges); i++)
165    for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
166      EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
167  delete prog;
168  re->Decref();
169}
170
171}  // namespace re2
172