10d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Copyright 2008 The RE2 Authors. All Rights Reserved. 20d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Use of this source code is governed by a BSD-style 30d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// license that can be found in the LICENSE file. 40d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 50d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Exhaustive testing of regular expression matching. 60d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 70d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "util/test.h" 80d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/testing/exhaustive_tester.h" 90d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinnamespace re2 { 110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test simple character classes by themselves. 130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(CharacterClasses, Exhaustive) { 140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<string> atoms = Split(" ", 150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."); 160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), 170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 5, Explode("ab"), "", ""); 180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test simple character classes inside a___b (for example, a[a]b). 210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(CharacterClasses, ExhaustiveAB) { 220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<string> atoms = Split(" ", 230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."); 240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), 250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 5, Explode("ab"), "a%sb", ""); 260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Returns UTF8 for Rune r 290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstatic string UTF8(Rune r) { 300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin char buf[UTFmax+1]; 310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin buf[runetochar(buf, &r)] = 0; 320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return string(buf); 330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Returns a vector of "interesting" UTF8 characters. 360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Unicode is now too big to just return all of them, 370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// so UTF8Characters return a set likely to be good test cases. 380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstatic const vector<string>& InterestingUTF8() { 390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin static bool init; 400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin static vector<string> v; 410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (init) 430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return v; 440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin init = true; 460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // All the Latin1 equivalents are interesting. 470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 1; i < 256; i++) 480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin v.push_back(UTF8(i)); 490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // After that, the codes near bit boundaries are 510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // interesting, because they span byte sequence lengths. 520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int j = 0; j < 8; j++) 530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin v.push_back(UTF8(256 + j)); 540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 512; i < Runemax; i <<= 1) 550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int j = -8; j < 8; j++) 560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin v.push_back(UTF8(i + j)); 570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // The codes near Runemax, including Runemax itself, are interesting. 590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int j = -8; j <= 0; j++) 600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin v.push_back(UTF8(Runemax + j)); 610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return v; 630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test interesting UTF-8 characters against character classes. 660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(InterestingUTF8, SingleOps) { 670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<string> atoms = Split(" ", 680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " 690d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " 700d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " 710d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]"); 720d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<string> ops; // no ops 730d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ExhaustiveTest(1, 0, atoms, ops, 740d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1, InterestingUTF8(), "", ""); 750d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 760d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 770d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test interesting UTF-8 characters against character classes, 780d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// but wrap everything inside AB. 790d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(InterestingUTF8, AB) { 800d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<string> atoms = Split(" ", 810d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " 820d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " 830d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " 840d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]"); 850d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<string> ops; // no ops 860d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<string> alpha = InterestingUTF8(); 870d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 0; i < alpha.size(); i++) 880d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin alpha[i] = "a" + alpha[i] + "b"; 890d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ExhaustiveTest(1, 0, atoms, ops, 900d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1, alpha, "a%sb", ""); 910d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 920d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 930d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} // namespace re2 940d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 95