1// class template regex -*- C++ -*- 2 3// Copyright (C) 2010, 2011 Free Software Foundation, Inc. 4// 5// This file is part of the GNU ISO C++ Library. This library is free 6// software; you can redistribute it and/or modify it under the 7// terms of the GNU General Public License as published by the 8// Free Software Foundation; either version 3, or (at your option) 9// any later version. 10 11// This library is distributed in the hope that it will be useful, 12// but WITHOUT ANY WARRANTY; without even the implied warranty of 13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14// GNU General Public License for more details. 15 16// Under Section 7 of GPL version 3, you are granted additional 17// permissions described in the GCC Runtime Library Exception, version 18// 3.1, as published by the Free Software Foundation. 19 20// You should have received a copy of the GNU General Public License and 21// a copy of the GCC Runtime Library Exception along with this program; 22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23// <http://www.gnu.org/licenses/>. 24 25/** 26 * @file bits/regex_nfa.h 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31namespace std _GLIBCXX_VISIBILITY(default) 32{ 33namespace __regex 34{ 35_GLIBCXX_BEGIN_NAMESPACE_VERSION 36 37 // Base class for, um, automata. Could be an NFA or a DFA. Your choice. 38 class _Automaton 39 { 40 public: 41 typedef unsigned int _SizeT; 42 43 public: 44 virtual 45 ~_Automaton() { } 46 47 virtual _SizeT 48 _M_sub_count() const = 0; 49 50#ifdef _GLIBCXX_DEBUG 51 virtual std::ostream& 52 _M_dot(std::ostream& __ostr) const = 0; 53#endif 54 }; 55 56 // Generic shared pointer to an automaton. 57 typedef std::shared_ptr<_Automaton> _AutomatonPtr; 58 59 // Operation codes that define the type of transitions within the base NFA 60 // that represents the regular expression. 61 enum _Opcode 62 { 63 _S_opcode_unknown = 0, 64 _S_opcode_alternative = 1, 65 _S_opcode_subexpr_begin = 4, 66 _S_opcode_subexpr_end = 5, 67 _S_opcode_match = 100, 68 _S_opcode_accept = 255 69 }; 70 71 // Provides a generic facade for a templated match_results. 72 struct _Results 73 { 74 virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0; 75 virtual void _M_set_matched(int __i, bool __is_matched) = 0; 76 }; 77 78 // Tags current state (for subexpr begin/end). 79 typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger; 80 81 template<typename _FwdIterT, typename _TraitsT> 82 struct _StartTagger 83 { 84 explicit 85 _StartTagger(int __i) 86 : _M_index(__i) 87 { } 88 89 void 90 operator()(const _PatternCursor& __pc, _Results& __r) 91 { __r._M_set_pos(_M_index, 0, __pc); } 92 93 int _M_index; 94 }; 95 96 template<typename _FwdIterT, typename _TraitsT> 97 struct _EndTagger 98 { 99 explicit 100 _EndTagger(int __i) 101 : _M_index(__i) 102 { } 103 104 void 105 operator()(const _PatternCursor& __pc, _Results& __r) 106 { __r._M_set_pos(_M_index, 1, __pc); } 107 108 int _M_index; 109 _FwdIterT _M_pos; 110 }; 111 // Indicates if current state matches cursor current. 112 typedef std::function<bool (const _PatternCursor&)> _Matcher; 113 114 // Matches any character 115 inline bool 116 _AnyMatcher(const _PatternCursor&) 117 { return true; } 118 119 // Matches a single character 120 template<typename _InIterT, typename _TraitsT> 121 struct _CharMatcher 122 { 123 typedef typename _TraitsT::char_type char_type; 124 125 explicit 126 _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT()) 127 : _M_traits(__t), _M_c(_M_traits.translate(__c)) 128 { } 129 130 bool 131 operator()(const _PatternCursor& __pc) const 132 { 133 typedef const _SpecializedCursor<_InIterT>& _CursorT; 134 _CursorT __c = static_cast<_CursorT>(__pc); 135 return _M_traits.translate(__c._M_current()) == _M_c; 136 } 137 138 const _TraitsT& _M_traits; 139 char_type _M_c; 140 }; 141 142 // Matches a character range (bracket expression) 143 template<typename _InIterT, typename _TraitsT> 144 struct _RangeMatcher 145 { 146 typedef typename _TraitsT::char_type _CharT; 147 typedef std::basic_string<_CharT> _StringT; 148 149 explicit 150 _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT()) 151 : _M_traits(__t), _M_is_non_matching(__is_non_matching) 152 { } 153 154 bool 155 operator()(const _PatternCursor& __pc) const 156 { 157 typedef const _SpecializedCursor<_InIterT>& _CursorT; 158 _CursorT __c = static_cast<_CursorT>(__pc); 159 return true; 160 } 161 162 void 163 _M_add_char(_CharT __c) 164 { } 165 166 void 167 _M_add_collating_element(const _StringT& __s) 168 { } 169 170 void 171 _M_add_equivalence_class(const _StringT& __s) 172 { } 173 174 void 175 _M_add_character_class(const _StringT& __s) 176 { } 177 178 void 179 _M_make_range() 180 { } 181 182 const _TraitsT& _M_traits; 183 bool _M_is_non_matching; 184 }; 185 186 // Identifies a state in the NFA. 187 typedef int _StateIdT; 188 189 // The special case in which a state identifier is not an index. 190 static const _StateIdT _S_invalid_state_id = -1; 191 192 193 // An individual state in an NFA 194 // 195 // In this case a "state" is an entry in the NFA definition coupled with its 196 // outgoing transition(s). All states have a single outgoing transition, 197 // except for accepting states (which have no outgoing transitions) and alt 198 // states, which have two outgoing transitions. 199 // 200 struct _State 201 { 202 typedef int _OpcodeT; 203 204 _OpcodeT _M_opcode; // type of outgoing transition 205 _StateIdT _M_next; // outgoing transition 206 _StateIdT _M_alt; // for _S_opcode_alternative 207 unsigned int _M_subexpr; // for _S_opcode_subexpr_* 208 _Tagger _M_tagger; // for _S_opcode_subexpr_* 209 _Matcher _M_matches; // for _S_opcode_match 210 211 explicit _State(_OpcodeT __opcode) 212 : _M_opcode(__opcode), _M_next(_S_invalid_state_id) 213 { } 214 215 _State(const _Matcher& __m) 216 : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m) 217 { } 218 219 _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t) 220 : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s), 221 _M_tagger(__t) 222 { } 223 224 _State(_StateIdT __next, _StateIdT __alt) 225 : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt) 226 { } 227 228#ifdef _GLIBCXX_DEBUG 229 std::ostream& 230 _M_print(std::ostream& ostr) const; 231 232 // Prints graphviz dot commands for state. 233 std::ostream& 234 _M_dot(std::ostream& __ostr, _StateIdT __id) const; 235#endif 236 }; 237 238 239 // The Grep Matcher works on sets of states. Here are sets of states. 240 typedef std::set<_StateIdT> _StateSet; 241 242 // A collection of all states making up an NFA 243 // 244 // An NFA is a 4-tuple M = (K, S, s, F), where 245 // K is a finite set of states, 246 // S is the alphabet of the NFA, 247 // s is the initial state, 248 // F is a set of final (accepting) states. 249 // 250 // This NFA class is templated on S, a type that will hold values of the 251 // underlying alphabet (without regard to semantics of that alphabet). The 252 // other elements of the tuple are generated during construction of the NFA 253 // and are available through accessor member functions. 254 // 255 class _Nfa 256 : public _Automaton, public std::vector<_State> 257 { 258 public: 259 typedef _State _StateT; 260 typedef unsigned int _SizeT; 261 typedef regex_constants::syntax_option_type _FlagT; 262 263 public: 264 _Nfa(_FlagT __f) 265 : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0) 266 { } 267 268 ~_Nfa() 269 { } 270 271 _FlagT 272 _M_options() const 273 { return _M_flags; } 274 275 _StateIdT 276 _M_start() const 277 { return _M_start_state; } 278 279 const _StateSet& 280 _M_final_states() const 281 { return _M_accepting_states; } 282 283 _SizeT 284 _M_sub_count() const 285 { return _M_subexpr_count; } 286 287 _StateIdT 288 _M_insert_accept() 289 { 290 this->push_back(_StateT(_S_opcode_accept)); 291 _M_accepting_states.insert(this->size()-1); 292 return this->size()-1; 293 } 294 295 _StateIdT 296 _M_insert_alt(_StateIdT __next, _StateIdT __alt) 297 { 298 this->push_back(_StateT(__next, __alt)); 299 return this->size()-1; 300 } 301 302 _StateIdT 303 _M_insert_matcher(_Matcher __m) 304 { 305 this->push_back(_StateT(__m)); 306 return this->size()-1; 307 } 308 309 _StateIdT 310 _M_insert_subexpr_begin(const _Tagger& __t) 311 { 312 this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t)); 313 return this->size()-1; 314 } 315 316 _StateIdT 317 _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t) 318 { 319 this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t)); 320 return this->size()-1; 321 } 322 323#ifdef _GLIBCXX_DEBUG 324 std::ostream& 325 _M_dot(std::ostream& __ostr) const; 326#endif 327 328 private: 329 _FlagT _M_flags; 330 _StateIdT _M_start_state; 331 _StateSet _M_accepting_states; 332 _SizeT _M_subexpr_count; 333 }; 334 335 // Describes a sequence of one or more %_State, its current start and end(s). 336 // 337 // This structure contains fragments of an NFA during construction. 338 class _StateSeq 339 { 340 public: 341 // Constructs a single-node sequence 342 _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id) 343 : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e) 344 { } 345 // Constructs a split sequence from two other sequencces 346 _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2) 347 : _M_nfa(__e1._M_nfa), 348 _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)), 349 _M_end1(__e1._M_end1), _M_end2(__e2._M_end1) 350 { } 351 352 // Constructs a split sequence from a single sequence 353 _StateSeq(const _StateSeq& __e, _StateIdT __id) 354 : _M_nfa(__e._M_nfa), 355 _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)), 356 _M_end1(__id), _M_end2(__e._M_end1) 357 { } 358 359 // Constructs a copy of a %_StateSeq 360 _StateSeq(const _StateSeq& __rhs) 361 : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start), 362 _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2) 363 { } 364 365 366 _StateSeq& operator=(const _StateSeq& __rhs); 367 368 _StateIdT 369 _M_front() const 370 { return _M_start; } 371 372 // Extends a sequence by one. 373 void 374 _M_push_back(_StateIdT __id); 375 376 // Extends and maybe joins a sequence. 377 void 378 _M_append(_StateIdT __id); 379 380 void 381 _M_append(_StateSeq& __rhs); 382 383 // Clones an entire sequence. 384 _StateIdT 385 _M_clone(); 386 387 private: 388 _Nfa& _M_nfa; 389 _StateIdT _M_start; 390 _StateIdT _M_end1; 391 _StateIdT _M_end2; 392 393 }; 394 395_GLIBCXX_END_NAMESPACE_VERSION 396} // namespace __regex 397} // namespace std 398 399#include <bits/regex_nfa.tcc> 400 401