119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//===- StringMatcher.cpp - Generate a matcher for input strings -----------===//
219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//
319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//                     The LLVM Compiler Infrastructure
419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//
519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// This file is distributed under the University of Illinois Open Source
619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// License. See LICENSE.TXT for details.
719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//
819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//===----------------------------------------------------------------------===//
919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//
1019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman// This file implements the StringMatcher class.
1119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//
1219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman//===----------------------------------------------------------------------===//
1319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
1419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman#include "StringMatcher.h"
1519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman#include "llvm/Support/raw_ostream.h"
1619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman#include <map>
1719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Baumanusing namespace llvm;
1819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
1919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// FindFirstNonCommonLetter - Find the first character in the keys of the
2019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// string pairs that is not shared across the whole set of strings.  All
2119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// strings are assumed to have the same length.
2219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Baumanstatic unsigned
2319bac1e08be200c31efd26f0f5fd144c9b3eefd3John BaumanFindFirstNonCommonLetter(const std::vector<const
2419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman                              StringMatcher::StringPair*> &Matches) {
2519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  assert(!Matches.empty());
2619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  for (unsigned i = 0, e = Matches[0]->first.size(); i != e; ++i) {
2719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    // Check to see if letter i is the same across the set.
2819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    char Letter = Matches[0]->first[i];
2919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
3019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    for (unsigned str = 0, e = Matches.size(); str != e; ++str)
3119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      if (Matches[str]->first[i] != Letter)
3219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman        return i;
3319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  }
3419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
3519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  return Matches[0]->first.size();
3619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman}
3719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
3819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// EmitStringMatcherForChar - Given a set of strings that are known to be the
3919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// same length and whose characters leading up to CharNo are the same, emit
4019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// code to verify that CharNo and later are the same.
4119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman///
4219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// \return - True if control can leave the emitted code fragment.
4319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Baumanbool StringMatcher::
4419bac1e08be200c31efd26f0f5fd144c9b3eefd3John BaumanEmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
4519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman                         unsigned CharNo, unsigned IndentCount) const {
4619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  assert(!Matches.empty() && "Must have at least one string to match!");
4719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  std::string Indent(IndentCount*2+4, ' ');
4819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
4919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // If we have verified that the entire string matches, we're done: output the
5019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // matching code.
5119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  if (CharNo == Matches[0]->first.size()) {
5219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    assert(Matches.size() == 1 && "Had duplicate keys to match on");
5319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
5419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    // If the to-execute code has \n's in it, indent each subsequent line.
5519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    StringRef Code = Matches[0]->second;
5619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
5719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    std::pair<StringRef, StringRef> Split = Code.split('\n');
5819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    OS << Indent << Split.first << "\t // \"" << Matches[0]->first << "\"\n";
5919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
6019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    Code = Split.second;
6119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    while (!Code.empty()) {
6219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      Split = Code.split('\n');
6319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      OS << Indent << Split.first << "\n";
6419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      Code = Split.second;
6519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    }
6619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    return false;
6719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  }
6819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
6919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // Bucket the matches by the character we are comparing.
7019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  std::map<char, std::vector<const StringPair*> > MatchesByLetter;
7119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
7219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
7319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]);
7419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
7519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
7619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // If we have exactly one bucket to match, see how many characters are common
7719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // across the whole set and match all of them at once.
7819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  if (MatchesByLetter.size() == 1) {
7919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    unsigned FirstNonCommonLetter = FindFirstNonCommonLetter(Matches);
8019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    unsigned NumChars = FirstNonCommonLetter-CharNo;
8119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
8219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    // Emit code to break out if the prefix doesn't match.
8319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    if (NumChars == 1) {
8419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      // Do the comparison with if (Str[1] != 'f')
8519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      // FIXME: Need to escape general characters.
8619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      OS << Indent << "if (" << StrVariableName << "[" << CharNo << "] != '"
8719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      << Matches[0]->first[CharNo] << "')\n";
8819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      OS << Indent << "  break;\n";
8919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    } else {
9019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      // Do the comparison with if (Str.substr(1, 3) != "foo").
9119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      // FIXME: Need to escape general strings.
9219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      OS << Indent << "if (" << StrVariableName << ".substr(" << CharNo << ", "
9319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      << NumChars << ") != \"";
9419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      OS << Matches[0]->first.substr(CharNo, NumChars) << "\")\n";
9519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      OS << Indent << "  break;\n";
9619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    }
9719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
9819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    return EmitStringMatcherForChar(Matches, FirstNonCommonLetter, IndentCount);
9919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  }
10019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
10119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // Otherwise, we have multiple possible things, emit a switch on the
10219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // character.
10319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n";
10419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  OS << Indent << "default: break;\n";
10519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
10619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  for (std::map<char, std::vector<const StringPair*> >::iterator LI =
10719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman       MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) {
10819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    // TODO: escape hard stuff (like \n) if we ever care about it.
10919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    OS << Indent << "case '" << LI->first << "':\t // "
11019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman       << LI->second.size() << " string";
11119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    if (LI->second.size() != 1) OS << 's';
11219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    OS << " to match.\n";
11319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    if (EmitStringMatcherForChar(LI->second, CharNo+1, IndentCount+1))
11419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      OS << Indent << "  break;\n";
11519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  }
11619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
11719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  OS << Indent << "}\n";
11819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  return true;
11919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman}
12019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
12119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
12219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman/// Emit - Top level entry point.
12319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman///
12419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Baumanvoid StringMatcher::Emit(unsigned Indent) const {
12519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // If nothing to match, just fall through.
12619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  if (Matches.empty()) return;
12719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
12819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // First level categorization: group strings by length.
12919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  std::map<unsigned, std::vector<const StringPair*> > MatchesByLength;
13019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
13119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
13219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]);
13319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
13419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // Output a switch statement on length and categorize the elements within each
13519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  // bin.
13619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  OS.indent(Indent*2+2) << "switch (" << StrVariableName << ".size()) {\n";
13719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  OS.indent(Indent*2+2) << "default: break;\n";
13819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
13919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI =
14019bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman       MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) {
14119bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    OS.indent(Indent*2+2) << "case " << LI->first << ":\t // "
14219bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman       << LI->second.size()
14319bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman       << " string" << (LI->second.size() == 1 ? "" : "s") << " to match.\n";
14419bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman    if (EmitStringMatcherForChar(LI->second, 0, Indent))
14519bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman      OS.indent(Indent*2+4) << "break;\n";
14619bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  }
14719bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman
14819bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman  OS.indent(Indent*2+2) << "}\n";
14919bac1e08be200c31efd26f0f5fd144c9b3eefd3John Bauman}
150