10d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Copyright 2008 The RE2 Authors.  All Rights Reserved.
20d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Use of this source code is governed by a BSD-style
30d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// license that can be found in the LICENSE file.
40d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
50d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Exhaustive testing of regular expression matching.
60d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
70d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "util/test.h"
80d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/testing/exhaustive_tester.h"
90d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinnamespace re2 {
110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test simple character classes by themselves.
130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(CharacterClasses, Exhaustive) {
140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  vector<string> atoms = Split(" ",
150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                 5, Explode("ab"), "", "");
180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test simple character classes inside a___b (for example, a[a]b).
210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(CharacterClasses, ExhaustiveAB) {
220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  vector<string> atoms = Split(" ",
230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                 5, Explode("ab"), "a%sb", "");
260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Returns UTF8 for Rune r
290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstatic string UTF8(Rune r) {
300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  char buf[UTFmax+1];
310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  buf[runetochar(buf, &r)] = 0;
320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  return string(buf);
330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Returns a vector of "interesting" UTF8 characters.
360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Unicode is now too big to just return all of them,
370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// so UTF8Characters return a set likely to be good test cases.
380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstatic const vector<string>& InterestingUTF8() {
390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  static bool init;
400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  static vector<string> v;
410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  if (init)
430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    return v;
440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  init = true;
460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // All the Latin1 equivalents are interesting.
470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  for (int i = 1; i < 256; i++)
480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    v.push_back(UTF8(i));
490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // After that, the codes near bit boundaries are
510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // interesting, because they span byte sequence lengths.
520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  for (int j = 0; j < 8; j++)
530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    v.push_back(UTF8(256 + j));
540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  for (int i = 512; i < Runemax; i <<= 1)
550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    for (int j = -8; j < 8; j++)
560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin      v.push_back(UTF8(i + j));
570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // The codes near Runemax, including Runemax itself, are interesting.
590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  for (int j = -8; j <= 0; j++)
600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    v.push_back(UTF8(Runemax + j));
610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  return v;
630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test interesting UTF-8 characters against character classes.
660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(InterestingUTF8, SingleOps) {
670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  vector<string> atoms = Split(" ",
680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
690d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
700d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
710d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
720d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  vector<string> ops;  // no ops
730d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  ExhaustiveTest(1, 0, atoms, ops,
740d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                 1, InterestingUTF8(), "", "");
750d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
760d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
770d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test interesting UTF-8 characters against character classes,
780d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// but wrap everything inside AB.
790d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(InterestingUTF8, AB) {
800d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  vector<string> atoms = Split(" ",
810d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
820d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
830d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
840d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
850d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  vector<string> ops;  // no ops
860d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  vector<string> alpha = InterestingUTF8();
870d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  for (int i = 0; i < alpha.size(); i++)
880d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    alpha[i] = "a" + alpha[i] + "b";
890d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  ExhaustiveTest(1, 0, atoms, ops,
900d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                 1, alpha, "a%sb", "");
910d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
920d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
930d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}  // namespace re2
940d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
95