119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//===- StringMatcher.cpp - Generate a matcher for input strings -----------===// 219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// 319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// The LLVM Compiler Infrastructure 419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// 519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// This file is distributed under the University of Illinois Open Source 619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// License. See LICENSE.TXT for details. 719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// 819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//===----------------------------------------------------------------------===// 919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// 1019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// This file implements the StringMatcher class. 1119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// 1219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//===----------------------------------------------------------------------===// 1319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 1419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman#include "StringMatcher.h" 1519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman#include "llvm/Support/raw_ostream.h" 1619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman#include <map> 1719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Baumanusing namespace llvm; 1819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 1919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// FindFirstNonCommonLetter - Find the first character in the keys of the 2019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// string pairs that is not shared across the whole set of strings. All 2119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// strings are assumed to have the same length. 2219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Baumanstatic unsigned 2319bac1e08be200c31efd26f0f5fd144c9b3eefd3John BaumanFindFirstNonCommonLetter(const std::vector<const 2419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman StringMatcher::StringPair*> &Matches) { 2519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman assert(!Matches.empty()); 2619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman for (unsigned i = 0, e = Matches[0]->first.size(); i != e; ++i) { 2719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // Check to see if letter i is the same across the set. 2819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman char Letter = Matches[0]->first[i]; 2919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 3019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman for (unsigned str = 0, e = Matches.size(); str != e; ++str) 3119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman if (Matches[str]->first[i] != Letter) 3219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman return i; 3319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman } 3419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 3519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman return Matches[0]->first.size(); 3619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman} 3719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 3819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// EmitStringMatcherForChar - Given a set of strings that are known to be the 3919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// same length and whose characters leading up to CharNo are the same, emit 4019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// code to verify that CharNo and later are the same. 4119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// 4219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// \return - True if control can leave the emitted code fragment. 4319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Baumanbool StringMatcher:: 4419bac1e08be200c31efd26f0f5fd144c9b3eefd3John BaumanEmitStringMatcherForChar(const std::vector<const StringPair*> &Matches, 4519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman unsigned CharNo, unsigned IndentCount) const { 4619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman assert(!Matches.empty() && "Must have at least one string to match!"); 4719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman std::string Indent(IndentCount*2+4, ' '); 4819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 4919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // If we have verified that the entire string matches, we're done: output the 5019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // matching code. 5119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman if (CharNo == Matches[0]->first.size()) { 5219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman assert(Matches.size() == 1 && "Had duplicate keys to match on"); 5319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 5419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // If the to-execute code has \n's in it, indent each subsequent line. 5519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman StringRef Code = Matches[0]->second; 5619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 5719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman std::pair<StringRef, StringRef> Split = Code.split('\n'); 5819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << Split.first << "\t // \"" << Matches[0]->first << "\"\n"; 5919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 6019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman Code = Split.second; 6119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman while (!Code.empty()) { 6219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman Split = Code.split('\n'); 6319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << Split.first << "\n"; 6419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman Code = Split.second; 6519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman } 6619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman return false; 6719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman } 6819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 6919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // Bucket the matches by the character we are comparing. 7019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman std::map<char, std::vector<const StringPair*> > MatchesByLetter; 7119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 7219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman for (unsigned i = 0, e = Matches.size(); i != e; ++i) 7319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]); 7419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 7519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 7619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // If we have exactly one bucket to match, see how many characters are common 7719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // across the whole set and match all of them at once. 7819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman if (MatchesByLetter.size() == 1) { 7919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman unsigned FirstNonCommonLetter = FindFirstNonCommonLetter(Matches); 8019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman unsigned NumChars = FirstNonCommonLetter-CharNo; 8119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 8219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // Emit code to break out if the prefix doesn't match. 8319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman if (NumChars == 1) { 8419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // Do the comparison with if (Str[1] != 'f') 8519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // FIXME: Need to escape general characters. 8619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << "if (" << StrVariableName << "[" << CharNo << "] != '" 8719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman << Matches[0]->first[CharNo] << "')\n"; 8819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << " break;\n"; 8919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman } else { 9019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // Do the comparison with if (Str.substr(1, 3) != "foo"). 9119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // FIXME: Need to escape general strings. 9219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << "if (" << StrVariableName << ".substr(" << CharNo << ", " 9319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman << NumChars << ") != \""; 9419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Matches[0]->first.substr(CharNo, NumChars) << "\")\n"; 9519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << " break;\n"; 9619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman } 9719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 9819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman return EmitStringMatcherForChar(Matches, FirstNonCommonLetter, IndentCount); 9919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman } 10019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 10119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // Otherwise, we have multiple possible things, emit a switch on the 10219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // character. 10319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n"; 10419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << "default: break;\n"; 10519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 10619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman for (std::map<char, std::vector<const StringPair*> >::iterator LI = 10719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) { 10819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // TODO: escape hard stuff (like \n) if we ever care about it. 10919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << "case '" << LI->first << "':\t // " 11019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman << LI->second.size() << " string"; 11119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman if (LI->second.size() != 1) OS << 's'; 11219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << " to match.\n"; 11319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman if (EmitStringMatcherForChar(LI->second, CharNo+1, IndentCount+1)) 11419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << " break;\n"; 11519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman } 11619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 11719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS << Indent << "}\n"; 11819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman return true; 11919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman} 12019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 12119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 12219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// Emit - Top level entry point. 12319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// 12419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Baumanvoid StringMatcher::Emit(unsigned Indent) const { 12519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // If nothing to match, just fall through. 12619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman if (Matches.empty()) return; 12719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 12819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // First level categorization: group strings by length. 12919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman std::map<unsigned, std::vector<const StringPair*> > MatchesByLength; 13019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 13119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman for (unsigned i = 0, e = Matches.size(); i != e; ++i) 13219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]); 13319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 13419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // Output a switch statement on length and categorize the elements within each 13519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman // bin. 13619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS.indent(Indent*2+2) << "switch (" << StrVariableName << ".size()) {\n"; 13719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS.indent(Indent*2+2) << "default: break;\n"; 13819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 13919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI = 14019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) { 14119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS.indent(Indent*2+2) << "case " << LI->first << ":\t // " 14219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman << LI->second.size() 14319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman << " string" << (LI->second.size() == 1 ? "" : "s") << " to match.\n"; 14419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman if (EmitStringMatcherForChar(LI->second, 0, Indent)) 14519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS.indent(Indent*2+4) << "break;\n"; 14619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman } 14719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman 14819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman OS.indent(Indent*2+2) << "}\n"; 14919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman} 150