1ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//===-- Regex.cpp - Regular Expression matcher implementation -------------===//
2ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//
3ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//                     The LLVM Compiler Infrastructure
4ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//
5ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// This file is distributed under the University of Illinois Open Source
6ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// License. See LICENSE.TXT for details.
7ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//
8ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//===----------------------------------------------------------------------===//
9ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//
10ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// This file implements a POSIX regular expression matcher.
11ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//
12ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//===----------------------------------------------------------------------===//
1348ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner
14ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin#include "llvm/Support/Regex.h"
15d04a8d4b33ff316ca4cf961e06c9e312eff8e64fChandler Carruth#include "regex_impl.h"
16d04a8d4b33ff316ca4cf961e06c9e312eff8e64fChandler Carruth#include "llvm/ADT/SmallVector.h"
17ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin#include "llvm/Support/ErrorHandling.h"
18ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin#include "llvm/Support/raw_ostream.h"
19ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin#include <string>
20ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwinusing namespace llvm;
2148ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner
2238e59891ee4417a9be2f8146ce0ba3269e38ac21Benjamin KramerRegex::Regex(StringRef regex, unsigned Flags) {
23ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  unsigned flags = 0;
24528700863adefca8de461ce28a7d903729fb96b4Chris Lattner  preg = new llvm_regex();
25ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  preg->re_endp = regex.end();
26ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  if (Flags & IgnoreCase)
27ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin    flags |= REG_ICASE;
28ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  if (Flags & Newline)
29ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin    flags |= REG_NEWLINE;
306b731486d4460e5f1088a6066c0081af048c1e45Eli Bendersky  if (!(Flags & BasicRegex))
316b731486d4460e5f1088a6066c0081af048c1e45Eli Bendersky    flags |= REG_EXTENDED;
326b731486d4460e5f1088a6066c0081af048c1e45Eli Bendersky  error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
33ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin}
34ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin
3581f46d9ce1888308b33336f9bea72147430da36bChris LattnerRegex::~Regex() {
3636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  if (preg) {
3736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    llvm_regfree(preg);
3836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    delete preg;
3936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  }
4081f46d9ce1888308b33336f9bea72147430da36bChris Lattner}
4181f46d9ce1888308b33336f9bea72147430da36bChris Lattner
4248ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattnerbool Regex::isValid(std::string &Error) {
43ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  if (!error)
44ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin    return true;
4581f46d9ce1888308b33336f9bea72147430da36bChris Lattner
46dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines  size_t len = llvm_regerror(error, preg, nullptr, 0);
4748ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner
48783a0387c5eef62ff50950aa3e977b2652a3c3a5Alexey Samsonov  Error.resize(len - 1);
4948ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner  llvm_regerror(error, preg, &Error[0], len);
50ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  return false;
51ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin}
52ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin
5381f46d9ce1888308b33336f9bea72147430da36bChris Lattner/// getNumMatches - In a valid regex, return the number of parenthesized
5481f46d9ce1888308b33336f9bea72147430da36bChris Lattner/// matches it contains.
5581f46d9ce1888308b33336f9bea72147430da36bChris Lattnerunsigned Regex::getNumMatches() const {
5681f46d9ce1888308b33336f9bea72147430da36bChris Lattner  return preg->re_nsub;
57ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin}
58ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin
5938e59891ee4417a9be2f8146ce0ba3269e38ac21Benjamin Kramerbool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
60ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
61ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin
62ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  // pmatch needs to have at least one element.
63528700863adefca8de461ce28a7d903729fb96b4Chris Lattner  SmallVector<llvm_regmatch_t, 8> pm;
64ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  pm.resize(nmatch > 0 ? nmatch : 1);
65ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  pm[0].rm_so = 0;
66ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  pm[0].rm_eo = String.size();
67ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin
68ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
69ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin
70ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  if (rc == REG_NOMATCH)
71ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin    return false;
72ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  if (rc != 0) {
73ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin    // regexec can fail due to invalid pattern or running out of memory.
74ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin    error = rc;
75ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin    return false;
76ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  }
77ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin
78ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  // There was a match.
79ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin
80ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  if (Matches) { // match position requested
8181f46d9ce1888308b33336f9bea72147430da36bChris Lattner    Matches->clear();
8281f46d9ce1888308b33336f9bea72147430da36bChris Lattner
8348ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner    for (unsigned i = 0; i != nmatch; ++i) {
84ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin      if (pm[i].rm_so == -1) {
85ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin        // this group didn't match
86ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin        Matches->push_back(StringRef());
87ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin        continue;
88ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin      }
896ff80b2281eea0f42458bbf9790d5e340d9d7797Chris Lattner      assert(pm[i].rm_eo >= pm[i].rm_so);
90ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin      Matches->push_back(StringRef(String.data()+pm[i].rm_so,
91ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin                                   pm[i].rm_eo-pm[i].rm_so));
92ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin    }
93ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  }
94ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin
95ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin  return true;
96ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin}
97d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
98d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbarstd::string Regex::sub(StringRef Repl, StringRef String,
99d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar                       std::string *Error) {
100d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  SmallVector<StringRef, 8> Matches;
101d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
102d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  // Reset error, if given.
103d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  if (Error && !Error->empty()) *Error = "";
104d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
105d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  // Return the input if there was no match.
106d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  if (!match(String, &Matches))
107d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    return String;
108d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
109d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  // Otherwise splice in the replacement string, starting with the prefix before
110d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  // the match.
111d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  std::string Res(String.begin(), Matches[0].begin());
112d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
113d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  // Then the replacement string, honoring possible substitutions.
114d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  while (!Repl.empty()) {
115d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    // Skip to the next escape.
116d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    std::pair<StringRef, StringRef> Split = Repl.split('\\');
117d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
118d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    // Add the skipped substring.
119d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    Res += Split.first;
120d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
121d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    // Check for terminimation and trailing backslash.
122d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    if (Split.second.empty()) {
123d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      if (Repl.size() != Split.first.size() &&
124d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar          Error && Error->empty())
125d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar        *Error = "replacement string contained trailing backslash";
126d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      break;
127d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    }
128d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
129d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    // Otherwise update the replacement string and interpret escapes.
130d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    Repl = Split.second;
131d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
132d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    // FIXME: We should have a StringExtras function for mapping C99 escapes.
133d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    switch (Repl[0]) {
134d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      // Treat all unrecognized characters as self-quoting.
135d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    default:
136d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      Res += Repl[0];
137d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      Repl = Repl.substr(1);
138d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      break;
139d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
140d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      // Single character escapes.
141d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    case 't':
142d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      Res += '\t';
143d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      Repl = Repl.substr(1);
144d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      break;
145d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    case 'n':
146d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      Res += '\n';
147d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      Repl = Repl.substr(1);
148d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      break;
149d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
150d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      // Decimal escapes are backreferences.
151d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    case '0': case '1': case '2': case '3': case '4':
152d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    case '5': case '6': case '7': case '8': case '9': {
153d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      // Extract the backreference number.
154d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
155d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      Repl = Repl.substr(Ref.size());
156d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
157d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      unsigned RefValue;
158d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      if (!Ref.getAsInteger(10, RefValue) &&
159d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar          RefValue < Matches.size())
160d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar        Res += Matches[RefValue];
161d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      else if (Error && Error->empty())
162d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar        *Error = "invalid backreference string '" + Ref.str() + "'";
163d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar      break;
164d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    }
165d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar    }
166d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  }
167d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
168d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  // And finally the suffix.
169d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
170d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar
171d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar  return Res;
172d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar}
173aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne
17436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// These are the special characters matched in functions like "p_ere_exp".
17536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic const char RegexMetachars[] = "()^$|*+?.[]\\{}";
17636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
177aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbournebool Regex::isLiteralERE(StringRef Str) {
178aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne  // Check for regex metacharacters.  This list was derived from our regex
179aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne  // implementation in regcomp.c and double checked against the POSIX extended
180aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne  // regular expression specification.
18136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  return Str.find_first_of(RegexMetachars) == StringRef::npos;
18236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines}
18336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
18436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstd::string Regex::escape(StringRef String) {
18536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  std::string RegexStr;
18636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  for (unsigned i = 0, e = String.size(); i != e; ++i) {
18736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    if (strchr(RegexMetachars, String[i]))
18836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      RegexStr += '\\';
18936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    RegexStr += String[i];
19036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  }
19136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
19236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  return RegexStr;
193aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne}
194