// op_regex.cpp revision cc2ee177dbb3befca43e36cfc56778b006c3d050
1/** 2 * @file op_regex.cpp 3 * This file contains implementation for a lightweight wrapper around 4 * libc regex, providing regular expression match and replace facility. 5 * 6 * @remark Copyright 2003 OProfile authors 7 * @remark Read the file COPYING 8 * @remark Idea comes from TextFilt project <http://textfilt.sourceforge.net> 9 * 10 * @author Philippe Elie 11 */ 12 13#include <cerrno> 14 15#include <iostream> 16#include <fstream> 17 18#include "string_manip.h" 19 20#include "op_regex.h" 21 22using namespace std; 23 24namespace { 25 26string op_regerror(int err, regex_t const & regexp) 27{ 28 size_t needed_size = regerror(err, ®exp, 0, 0); 29 char * buffer = new char[needed_size]; 30 regerror(err, ®exp, buffer, needed_size); 31 32 return buffer; 33} 34 35 36void op_regcomp(regex_t & regexp, string const & pattern) 37{ 38 int err = regcomp(®exp, pattern.c_str(), REG_EXTENDED); 39 if (err) { 40 throw bad_regex("regcomp error: " + op_regerror(err, regexp) 41 + " for pattern : " + pattern); 42 } 43} 44 45 46bool op_regexec(regex_t const & regex, string const & str, regmatch_t * match, 47 size_t nmatch) 48{ 49 return regexec(®ex, str.c_str(), nmatch, match, 0) != REG_NOMATCH; 50} 51 52 53void op_regfree(regex_t & regexp) 54{ 55 regfree(®exp); 56} 57 58 59// return the index number associated with a char seen in a "\x". 60// Allowed range are for x is [0-9a-z] return size_t(-1) if x is not in 61// these ranges. 
62size_t subexpr_index(char ch) 63{ 64 if (isdigit(ch)) 65 return ch - '0'; 66 if (ch >= 'a' && ch <= 'z') 67 return ch - 'a' + 10; 68 return size_t(-1); 69} 70 71} // anonymous namespace 72 73 74bad_regex::bad_regex(string const & pattern) 75 : op_exception(pattern) 76{ 77} 78 79 80regular_expression_replace::regular_expression_replace(size_t limit_, 81 size_t limit_defs) 82 : 83 limit(limit_), 84 limit_defs_expansion(limit_defs) 85{ 86} 87 88 89regular_expression_replace::~regular_expression_replace() 90{ 91 for (size_t i = 0 ; i < regex_replace.size() ; ++i) 92 op_regfree(regex_replace[i].regexp); 93} 94 95 96void regular_expression_replace::add_definition(string const & name, 97 string const & definition) 98{ 99 defs[name] = expand_string(definition); 100} 101 102 103void regular_expression_replace::add_pattern(string const & pattern, 104 string const & replace) 105{ 106 string expanded_pattern = expand_string(pattern); 107 108 regex_t regexp; 109 op_regcomp(regexp, expanded_pattern); 110 replace_t regex = { regexp, replace }; 111 regex_replace.push_back(regex); 112} 113 114 115string regular_expression_replace::expand_string(string const & input) 116{ 117 string last, expanded(input); 118 size_t i = 0; 119 for (i = 0 ; i < limit_defs_expansion ; ++i) { 120 last = expanded; 121 expanded = substitute_definition(last); 122 if (expanded == last) 123 break; 124 } 125 126 if (i == limit_defs_expansion) 127 throw bad_regex("too many substitution for: + input"); 128 129 return last; 130} 131 132 133string regular_expression_replace::substitute_definition(string const & pattern) 134{ 135 string result; 136 bool previous_is_escape = false; 137 138 for (size_t i = 0 ; i < pattern.length() ; ++i) { 139 if (pattern[i] == '$' && !previous_is_escape) { 140 size_t pos = pattern.find('{', i); 141 if (pos != i + 1) { 142 throw bad_regex("invalid $ in pattern: " + pattern); 143 } 144 size_t end = pattern.find('}', i); 145 if (end == string::npos) { 146 throw bad_regex("no 
matching '}' in pattern: " + pattern); 147 } 148 string def_name = pattern.substr(pos+1, (end-pos) - 1); 149 if (defs.find(def_name) == defs.end()) { 150 throw bad_regex("definition not found and used in pattern: (" + def_name + ") " + pattern); 151 } 152 result += defs[def_name]; 153 i = end; 154 } else { 155 if (pattern[i] == '\\' && !previous_is_escape) { 156 previous_is_escape = true; 157 } else { 158 previous_is_escape = false; 159 } 160 result += pattern[i]; 161 } 162 } 163 164 return result; 165} 166 167 168// FIXME limit output string size ? (cause we can have exponential growing 169// of output string through a rule "a" = "aa") 170bool regular_expression_replace::execute(string & str) const 171{ 172 bool changed = true; 173 for (size_t nr_iter = 0; changed && nr_iter < limit ; ++nr_iter) { 174 changed = false; 175 for (size_t i = 0 ; i < regex_replace.size() ; ++i) { 176 if (do_execute(str, regex_replace[i])) { 177 changed = true; 178 } 179 } 180 } 181 182 // this don't return if the input string has been changed but if 183 // we reach the limit number of iteration. 
184 return changed == false; 185} 186 187 188bool regular_expression_replace::do_execute(string & str, 189 replace_t const & regexp) const 190{ 191 bool changed = false; 192 193 regmatch_t match[max_match]; 194 for (size_t iter = 0; 195 op_regexec(regexp.regexp, str, match, max_match) && iter < limit; 196 iter++) { 197 changed = true; 198 do_replace(str, regexp.replace, match); 199 } 200 201 return changed; 202} 203 204 205regmatch_t const & 206regular_expression_replace::get_match(regmatch_t const * match, char idx) const 207{ 208 size_t sub_expr = subexpr_index(idx); 209 if (sub_expr == size_t(-1)) 210 throw bad_regex("expect group index: " + idx); 211 if (sub_expr >= max_match) 212 throw bad_regex("illegal group index :" + idx); 213 return match[sub_expr]; 214} 215 216void regular_expression_replace::do_replace 217(string & str, string const & replace, regmatch_t const * match) const 218{ 219 string inserted; 220 for (size_t i = 0 ; i < replace.length() ; ++i) { 221 if (replace[i] == '\\') { 222 if (i == replace.length() - 1) { 223 throw bad_regex("illegal \\ trailer: " + 224 replace); 225 } 226 ++i; 227 if (replace[i] == '\\') { 228 inserted += '\\'; 229 } else { 230 regmatch_t const & matched = get_match(match, 231 replace[i]); 232 if (matched.rm_so == -1 && 233 matched.rm_eo == -1) { 234 // empty match: nothing todo 235 } else if (matched.rm_so == -1 || 236 matched.rm_eo == -1) { 237 throw bad_regex("illegal match: " + 238 replace); 239 } else { 240 inserted += str.substr(matched.rm_so, 241 matched.rm_eo - matched.rm_so); 242 } 243 } 244 } else { 245 inserted += replace[i]; 246 } 247 } 248 249 size_t first = match[0].rm_so; 250 size_t count = match[0].rm_eo - match[0].rm_so; 251 252 str.replace(first, count, inserted); 253} 254 255 256void setup_regex(regular_expression_replace & regex, 257 string const & filename) 258{ 259 ifstream in(filename.c_str()); 260 if (!in) { 261 throw op_runtime_error("Can't open file " + filename + 262 " for reading", errno); 263 
} 264 265 regular_expression_replace var_name_rule; 266 var_name_rule.add_pattern("^\\$([_a-zA-Z][_a-zA-Z0-9]*)[ ]*=.*", "\\1"); 267 regular_expression_replace var_value_rule; 268 var_value_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1"); 269 270 regular_expression_replace left_rule; 271 left_rule.add_pattern("[ ]*\"(.*)\"[ ]*=.*", "\\1"); 272 regular_expression_replace right_rule; 273 right_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1"); 274 275 string line; 276 while (getline(in, line)) { 277 line = trim(line); 278 if (line.empty() || line[0] == '#') 279 continue; 280 281 string temp = line; 282 var_name_rule.execute(temp); 283 if (temp == line) { 284 string left = line; 285 left_rule.execute(left); 286 if (left == line) { 287 throw bad_regex("invalid input file: " + 288 '"' + line + '"'); 289 } 290 291 string right = line; 292 right_rule.execute(right); 293 if (right == line) { 294 throw bad_regex("invalid input file: " 295 + '"' + line + '"'); 296 } 297 298 regex.add_pattern(left, right); 299 } else { 300 // temp != line ==> var_name_rule succeed to substitute 301 // into temp the var_name present in line 302 string var_name = temp; 303 string var_value = line; 304 var_value_rule.execute(var_value); 305 if (var_value == line) { 306 throw bad_regex("invalid input file: " + 307 '"' + line + '"'); 308 } 309 310 regex.add_definition(var_name, var_value); 311 } 312 } 313} 314