1ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// 3ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// The LLVM Compiler Infrastructure 4ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// 5ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// This file is distributed under the University of Illinois Open Source 6ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// License. See LICENSE.TXT for details. 7ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// 8ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//===----------------------------------------------------------------------===// 9ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// 10ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// This file implements a POSIX regular expression matcher. 11ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin// 12ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin//===----------------------------------------------------------------------===// 1348ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner 14ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin#include "llvm/Support/Regex.h" 15d04a8d4b33ff316ca4cf961e06c9e312eff8e64fChandler Carruth#include "regex_impl.h" 16d04a8d4b33ff316ca4cf961e06c9e312eff8e64fChandler Carruth#include "llvm/ADT/SmallVector.h" 17ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin#include "llvm/Support/ErrorHandling.h" 18ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin#include "llvm/Support/raw_ostream.h" 19ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin#include <string> 20ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwinusing namespace llvm; 2148ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner 2238e59891ee4417a9be2f8146ce0ba3269e38ac21Benjamin KramerRegex::Regex(StringRef regex, unsigned Flags) { 23ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin unsigned flags = 0; 24528700863adefca8de461ce28a7d903729fb96b4Chris Lattner preg = new llvm_regex(); 25ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin preg->re_endp = regex.end(); 26ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin if (Flags & IgnoreCase) 27ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin flags |= REG_ICASE; 28ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin if (Flags & Newline) 29ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin flags |= REG_NEWLINE; 306b731486d4460e5f1088a6066c0081af048c1e45Eli Bendersky if (!(Flags & BasicRegex)) 316b731486d4460e5f1088a6066c0081af048c1e45Eli Bendersky flags |= REG_EXTENDED; 326b731486d4460e5f1088a6066c0081af048c1e45Eli Bendersky error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 33ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin} 34ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin 3581f46d9ce1888308b33336f9bea72147430da36bChris LattnerRegex::~Regex() { 3636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (preg) { 3736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines llvm_regfree(preg); 3836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines delete preg; 3936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 4081f46d9ce1888308b33336f9bea72147430da36bChris Lattner} 4181f46d9ce1888308b33336f9bea72147430da36bChris Lattner 4248ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattnerbool Regex::isValid(std::string &Error) { 43ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin if (!error) 44ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin return true; 4581f46d9ce1888308b33336f9bea72147430da36bChris Lattner 46dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines size_t len = llvm_regerror(error, preg, nullptr, 0); 4748ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner 48783a0387c5eef62ff50950aa3e977b2652a3c3a5Alexey Samsonov Error.resize(len - 1); 4948ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner llvm_regerror(error, preg, &Error[0], len); 50ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin return false; 51ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin} 52ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin 5381f46d9ce1888308b33336f9bea72147430da36bChris Lattner/// getNumMatches - In a valid regex, return the number of parenthesized 5481f46d9ce1888308b33336f9bea72147430da36bChris Lattner/// matches it contains. 5581f46d9ce1888308b33336f9bea72147430da36bChris Lattnerunsigned Regex::getNumMatches() const { 5681f46d9ce1888308b33336f9bea72147430da36bChris Lattner return preg->re_nsub; 57ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin} 58ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin 5938e59891ee4417a9be2f8146ce0ba3269e38ac21Benjamin Kramerbool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){ 60ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 61ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin 62ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin // pmatch needs to have at least one element. 63528700863adefca8de461ce28a7d903729fb96b4Chris Lattner SmallVector<llvm_regmatch_t, 8> pm; 64ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin pm.resize(nmatch > 0 ? nmatch : 1); 65ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin pm[0].rm_so = 0; 66ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin pm[0].rm_eo = String.size(); 67ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin 68ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 69ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin 70ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin if (rc == REG_NOMATCH) 71ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin return false; 72ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin if (rc != 0) { 73ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin // regexec can fail due to invalid pattern or running out of memory. 74ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin error = rc; 75ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin return false; 76ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin } 77ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin 78ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin // There was a match. 79ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin 80ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin if (Matches) { // match position requested 8181f46d9ce1888308b33336f9bea72147430da36bChris Lattner Matches->clear(); 8281f46d9ce1888308b33336f9bea72147430da36bChris Lattner 8348ba9ff3c44f33c65c26ecf5df306f4a428d2c26Chris Lattner for (unsigned i = 0; i != nmatch; ++i) { 84ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin if (pm[i].rm_so == -1) { 85ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin // this group didn't match 86ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin Matches->push_back(StringRef()); 87ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin continue; 88ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin } 896ff80b2281eea0f42458bbf9790d5e340d9d7797Chris Lattner assert(pm[i].rm_eo >= pm[i].rm_so); 90ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin Matches->push_back(StringRef(String.data()+pm[i].rm_so, 91ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin pm[i].rm_eo-pm[i].rm_so)); 92ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin } 93ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin } 94ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin 95ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin return true; 96ce0c81e7dd321e9f94f628daa5528f56cab0ab88Torok Edwin} 97d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 98d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbarstd::string Regex::sub(StringRef Repl, StringRef String, 99d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar std::string *Error) { 100d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar SmallVector<StringRef, 8> Matches; 101d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 102d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Reset error, if given. 103d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar if (Error && !Error->empty()) *Error = ""; 104d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 105d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Return the input if there was no match. 106d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar if (!match(String, &Matches)) 107d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar return String; 108d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 109d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Otherwise splice in the replacement string, starting with the prefix before 110d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // the match. 111d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar std::string Res(String.begin(), Matches[0].begin()); 112d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 113d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Then the replacement string, honoring possible substitutions. 114d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar while (!Repl.empty()) { 115d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Skip to the next escape. 116d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar std::pair<StringRef, StringRef> Split = Repl.split('\\'); 117d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 118d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Add the skipped substring. 119d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Res += Split.first; 120d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 121d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Check for terminimation and trailing backslash. 122d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar if (Split.second.empty()) { 123d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar if (Repl.size() != Split.first.size() && 124d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Error && Error->empty()) 125d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar *Error = "replacement string contained trailing backslash"; 126d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar break; 127d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar } 128d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 129d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Otherwise update the replacement string and interpret escapes. 130d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Repl = Split.second; 131d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 132d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // FIXME: We should have a StringExtras function for mapping C99 escapes. 133d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar switch (Repl[0]) { 134d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Treat all unrecognized characters as self-quoting. 135d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar default: 136d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Res += Repl[0]; 137d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Repl = Repl.substr(1); 138d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar break; 139d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 140d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Single character escapes. 141d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar case 't': 142d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Res += '\t'; 143d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Repl = Repl.substr(1); 144d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar break; 145d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar case 'n': 146d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Res += '\n'; 147d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Repl = Repl.substr(1); 148d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar break; 149d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 150d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Decimal escapes are backreferences. 151d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar case '0': case '1': case '2': case '3': case '4': 152d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar case '5': case '6': case '7': case '8': case '9': { 153d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // Extract the backreference number. 154d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 155d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Repl = Repl.substr(Ref.size()); 156d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 157d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar unsigned RefValue; 158d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar if (!Ref.getAsInteger(10, RefValue) && 159d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar RefValue < Matches.size()) 160d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Res += Matches[RefValue]; 161d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar else if (Error && Error->empty()) 162d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar *Error = "invalid backreference string '" + Ref.str() + "'"; 163d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar break; 164d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar } 165d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar } 166d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar } 167d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 168d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar // And finally the suffix. 169d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 170d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar 171d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar return Res; 172d2a5c0d8562407f9acab97451a785b513edd4c9bDaniel Dunbar} 173aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne 17436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// These are the special characters matched in functions like "p_ere_exp". 17536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 17636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 177aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbournebool Regex::isLiteralERE(StringRef Str) { 178aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne // Check for regex metacharacters. This list was derived from our regex 179aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne // implementation in regcomp.c and double checked against the POSIX extended 180aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne // regular expression specification. 18136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return Str.find_first_of(RegexMetachars) == StringRef::npos; 18236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 18336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 18436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstd::string Regex::escape(StringRef String) { 18536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines std::string RegexStr; 18636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines for (unsigned i = 0, e = String.size(); i != e; ++i) { 18736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (strchr(RegexMetachars, String[i])) 18836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines RegexStr += '\\'; 18936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines RegexStr += String[i]; 19036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 19136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 19236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return RegexStr; 193aa80e61b0d79ddf9593f6217063574d0c66c3099Peter Collingbourne} 194