repattrn.cpp revision c7f5f8508d98d5952d42ed7648c2a8f30a4da156
1//
2//  file:  repattrn.cpp
3//
4/*
5***************************************************************************
6*   Copyright (C) 2002-2008 International Business Machines Corporation   *
7*   and others. All rights reserved.                                      *
8***************************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_REGULAR_EXPRESSIONS
14
15#include "unicode/regex.h"
16#include "unicode/uclean.h"
17#include "uassert.h"
18#include "uvector.h"
19#include "uvectr32.h"
20#include "regexcmp.h"
21#include "regeximp.h"
22#include "regexst.h"
23
24U_NAMESPACE_BEGIN
25
26//--------------------------------------------------------------------------
27//
28//    RegexPattern    Default Constructor
29//
30//--------------------------------------------------------------------------
31RegexPattern::RegexPattern() {
32    UErrorCode status = U_ZERO_ERROR;
33    u_init(&status);
34    // Init all of this instances data.
35    init();
36
37    // Lazy init of all shared global sets.
38    RegexStaticSets::initGlobals(&fDeferredStatus);
39}
40
41
42//--------------------------------------------------------------------------
43//
44//   Copy Constructor        Note:  This is a rather inefficient implementation,
45//                                  but it probably doesn't matter.
46//
47//--------------------------------------------------------------------------
48RegexPattern::RegexPattern(const RegexPattern &other) :  UObject(other) {
49    init();
50    *this = other;
51}
52
53
54
55//--------------------------------------------------------------------------
56//
//    Assignment Operator
58//
59//--------------------------------------------------------------------------
60RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
61    if (this == &other) {
62        // Source and destination are the same.  Don't do anything.
63        return *this;
64    }
65
66    // Clean out any previous contents of object being assigned to.
67    zap();
68
69    // Give target object a default initialization
70    init();
71
72    // Copy simple fields
73    fPattern          = other.fPattern;
74    fFlags            = other.fFlags;
75    fLiteralText      = other.fLiteralText;
76    fDeferredStatus   = other.fDeferredStatus;
77    fMinMatchLen      = other.fMinMatchLen;
78    fFrameSize        = other.fFrameSize;
79    fDataSize         = other.fDataSize;
80    fMaxCaptureDigits = other.fMaxCaptureDigits;
81    fStaticSets       = other.fStaticSets;
82    fStaticSets8      = other.fStaticSets8;
83
84    fStartType        = other.fStartType;
85    fInitialStringIdx = other.fInitialStringIdx;
86    fInitialStringLen = other.fInitialStringLen;
87    *fInitialChars    = *other.fInitialChars;
88    fInitialChar      = other.fInitialChar;
89    *fInitialChars8   = *other.fInitialChars8;
90
91    //  Copy the pattern.  It's just values, nothing deep to copy.
92    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
93    fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
94
95    //  Copy the Unicode Sets.
96    //    Could be made more efficient if the sets were reference counted and shared,
97    //    but I doubt that pattern copying will be particularly common.
98    //    Note:  init() already added an empty element zero to fSets
99    int32_t i;
100    int32_t  numSets = other.fSets->size();
101    fSets8 = new Regex8BitSet[numSets];
102    if (fSets8 == NULL) {
103    	fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
104    	return *this;
105    }
106    for (i=1; i<numSets; i++) {
107        if (U_FAILURE(fDeferredStatus)) {
108            return *this;
109        }
110        UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
111        UnicodeSet *newSet    = new UnicodeSet(*sourceSet);
112        if (newSet == NULL) {
113            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
114            break;
115        }
116        fSets->addElement(newSet, fDeferredStatus);
117        fSets8[i] = other.fSets8[i];
118    }
119
120    return *this;
121}
122
123
124//--------------------------------------------------------------------------
125//
126//    init        Shared initialization for use by constructors.
127//                Bring an uninitialized RegexPattern up to a default state.
128//
129//--------------------------------------------------------------------------
130void RegexPattern::init() {
131    fPattern.remove();
132    fFlags            = 0;
133    fCompiledPat      = 0;
134    fLiteralText.remove();
135    fSets             = NULL;
136    fSets8            = NULL;
137    fDeferredStatus   = U_ZERO_ERROR;
138    fMinMatchLen      = 0;
139    fFrameSize        = 0;
140    fDataSize         = 0;
141    fGroupMap         = NULL;
142    fMaxCaptureDigits = 1;
143    fStaticSets       = NULL;
144    fStaticSets8      = NULL;
145    fStartType        = START_NO_INFO;
146    fInitialStringIdx = 0;
147    fInitialStringLen = 0;
148    fInitialChars     = NULL;
149    fInitialChar      = 0;
150    fInitialChars8    = NULL;
151
152    fCompiledPat      = new UVector32(fDeferredStatus);
153    fGroupMap         = new UVector32(fDeferredStatus);
154    fSets             = new UVector(fDeferredStatus);
155    fInitialChars     = new UnicodeSet;
156    fInitialChars8    = new Regex8BitSet;
157    if (U_FAILURE(fDeferredStatus)) {
158        return;
159    }
160    if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
161        fInitialChars == NULL || fInitialChars8 == NULL) {
162        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
163        return;
164    }
165
166    // Slot zero of the vector of sets is reserved.  Fill it here.
167    fSets->addElement((int32_t)0, fDeferredStatus);
168}
169
170
171//--------------------------------------------------------------------------
172//
173//   zap            Delete everything owned by this RegexPattern.
174//
175//--------------------------------------------------------------------------
176void RegexPattern::zap() {
177    delete fCompiledPat;
178    fCompiledPat = NULL;
179    int i;
180    for (i=1; i<fSets->size(); i++) {
181        UnicodeSet *s;
182        s = (UnicodeSet *)fSets->elementAt(i);
183        if (s != NULL) {
184            delete s;
185        }
186    }
187    delete fSets;
188    fSets = NULL;
189    delete[] fSets8;
190    fSets8 = NULL;
191    delete fGroupMap;
192    fGroupMap = NULL;
193    delete fInitialChars;
194    fInitialChars = NULL;
195    delete fInitialChars8;
196    fInitialChars8 = NULL;
197}
198
199
200//--------------------------------------------------------------------------
201//
202//   Destructor
203//
204//--------------------------------------------------------------------------
205RegexPattern::~RegexPattern() {
206    zap();
207}
208
209
210//--------------------------------------------------------------------------
211//
212//   Clone
213//
214//--------------------------------------------------------------------------
215RegexPattern  *RegexPattern::clone() const {
216    RegexPattern  *copy = new RegexPattern(*this);
217    return copy;
218}
219
220
221//--------------------------------------------------------------------------
222//
//   operator ==   (comparison)    Consider two patterns to be == if the pattern
//                                 strings, flags and deferred status are the same.
225//
226//--------------------------------------------------------------------------
227UBool   RegexPattern::operator ==(const RegexPattern &other) const {
228    UBool r = this->fFlags    == other.fFlags &&
229              this->fPattern  == other.fPattern &&
230              this->fDeferredStatus == other.fDeferredStatus;
231    return r;
232}
233
234//---------------------------------------------------------------------
235//
236//   compile
237//
238//---------------------------------------------------------------------
239RegexPattern * U_EXPORT2
240RegexPattern::compile(const UnicodeString &regex,
241                      uint32_t             flags,
242                      UParseError          &pe,
243                      UErrorCode           &status)
244{
245
246    if (U_FAILURE(status)) {
247        return NULL;
248    }
249
250    const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
251                              UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
252                              UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES;
253
254    if ((flags & ~allFlags) != 0) {
255        status = U_REGEX_INVALID_FLAG;
256        return NULL;
257    }
258
259    if ((flags & UREGEX_CANON_EQ) != 0) {
260        status = U_REGEX_UNIMPLEMENTED;
261        return NULL;
262    }
263
264    RegexPattern *This = new RegexPattern;
265    if (This == NULL) {
266        status = U_MEMORY_ALLOCATION_ERROR;
267        return NULL;
268    }
269    if (U_FAILURE(This->fDeferredStatus)) {
270        status = This->fDeferredStatus;
271        delete This;
272        return NULL;
273    }
274    This->fFlags = flags;
275
276    RegexCompile     compiler(This, status);
277    compiler.compile(regex, pe, status);
278
279    if (U_FAILURE(status)) {
280        delete This;
281        This = NULL;
282    }
283
284    return This;
285}
286
287//
288//   compile with default flags.
289//
290RegexPattern * U_EXPORT2
291RegexPattern::compile(const UnicodeString &regex,
292                      UParseError         &pe,
293                      UErrorCode          &err)
294{
295    return compile(regex, 0, pe, err);
296}
297
298
299
300//
301//   compile with no UParseErr parameter.
302//
303RegexPattern * U_EXPORT2
304RegexPattern::compile( const UnicodeString &regex,
305        uint32_t             flags,
306        UErrorCode           &err)
307{
308    UParseError pe;
309    return compile(regex, flags, pe, err);
310}
311
312
313
314//---------------------------------------------------------------------
315//
316//   flags
317//
318//---------------------------------------------------------------------
319uint32_t RegexPattern::flags() const {
320    return fFlags;
321}
322
323
324//---------------------------------------------------------------------
325//
326//   matcher(UnicodeString, err)
327//
328//---------------------------------------------------------------------
329RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
330                                    UErrorCode          &status)  const {
331    RegexMatcher    *retMatcher = matcher(status);
332    retMatcher->fDeferredStatus = status;
333    if (retMatcher != NULL) {
334        retMatcher->reset(input);
335    }
336    return retMatcher;
337}
338
#if 0
// Disabled (compiled out) overload taking a raw UChar pointer.
// It is not meant to be called; the UnicodeString overload is the supported
// API.  It reports U_UNSUPPORTED_ERROR unless `status` already holds a failure.
RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
                                    UErrorCode          &status)  const
{
    if (!U_FAILURE(status)) {
        status = U_UNSUPPORTED_ERROR;
    }
    return NULL;
}
#endif
350
351//---------------------------------------------------------------------
352//
353//   matcher(status)
354//
355//---------------------------------------------------------------------
356RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
357    RegexMatcher    *retMatcher = NULL;
358
359    if (U_FAILURE(status)) {
360        return NULL;
361    }
362    if (U_FAILURE(fDeferredStatus)) {
363        status = fDeferredStatus;
364        return NULL;
365    }
366
367    retMatcher = new RegexMatcher(this);
368    if (retMatcher == NULL) {
369        status = U_MEMORY_ALLOCATION_ERROR;
370        return NULL;
371    }
372    return retMatcher;
373}
374
375
376
377//---------------------------------------------------------------------
378//
379//   matches        Convenience function to test for a match, starting
380//                  with a pattern string and a data string.
381//
382//---------------------------------------------------------------------
383UBool U_EXPORT2 RegexPattern::matches(const UnicodeString   &regex,
384              const UnicodeString   &input,
385                    UParseError     &pe,
386                    UErrorCode      &status) {
387
388    if (U_FAILURE(status)) {return FALSE;}
389
390    UBool         retVal;
391    RegexPattern *pat     = NULL;
392    RegexMatcher *matcher = NULL;
393
394    pat     = RegexPattern::compile(regex, 0, pe, status);
395    matcher = pat->matcher(input, status);
396    retVal  = matcher->matches(status);
397
398    delete matcher;
399    delete pat;
400    return retVal;
401}
402
403
404
405
406//---------------------------------------------------------------------
407//
408//   pattern
409//
410//---------------------------------------------------------------------
411UnicodeString RegexPattern::pattern() const {
412    return fPattern;
413}
414
415
416
417
418//---------------------------------------------------------------------
419//
420//   split
421//
422//---------------------------------------------------------------------
423int32_t  RegexPattern::split(const UnicodeString &input,
424        UnicodeString    dest[],
425        int32_t          destCapacity,
426        UErrorCode       &status) const
427{
428    if (U_FAILURE(status)) {
429        return 0;
430    };
431
432    RegexMatcher  m(this);
433    int32_t r = 0;
434    // Check m's status to make sure all is ok.
435    if (U_SUCCESS(m.fDeferredStatus)) {
436    	r = m.split(input, dest, destCapacity, status);
437    }
438    return r;
439}
440
441
442
443//---------------------------------------------------------------------
444//
445//   dump    Output the compiled form of the pattern.
446//           Debugging function only.
447//
448//---------------------------------------------------------------------
#if defined(REGEX_DEBUG)
// Debug helper: prints one line describing the compiled-pattern operation
// stored at position `index` of fCompiledPat: the index, the raw op word in
// hex, the opcode name, and an operand rendered according to the opcode type.
void   RegexPattern::dumpOp(int32_t index) const {
    static const char * const opNames[] = {URX_OPCODE_NAMES};

    const int32_t op    = fCompiledPat->elementAti(index);
    int32_t       val   = URX_VAL(op);
    const int32_t type  = URX_TYPE(op);

    // Index into opNames; types beyond the end of the table use entry 0.
    const int32_t nameIdx =
        ((uint32_t)type >= sizeof(opNames)/sizeof(char *)) ? 0 : type;

    REGEX_DUMP_DEBUG_PRINTF(("%4d   %08x    %-15s  ", index, op, opNames[nameIdx]));
    switch (type) {
    // Types with no operand field of interest.
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        break;

    // Types with an integer operand field.
    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
        REGEX_DUMP_DEBUG_PRINTF(("%d", val));
        break;

    // Single character operand; values of 256 and above are shown as '?'.
    case URX_ONECHAR:
    case URX_ONECHAR_I:
        REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
        break;

    // String operand: val is the start offset into fLiteralText, and the
    // following op (a URX_STRING_LEN) carries the length.
    case URX_STRING:
    case URX_STRING_I:
        {
            const int32_t lenOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lenOp) == URX_STRING_LEN);
            const int32_t strLimit = val + URX_VAL(lenOp);
            for (int32_t pos = val; pos < strLimit; pos++) {
                UChar c = fLiteralText[pos];
                // Anything below 32 or from 256 up is printed as a dot.
                if (c < 32 || c >= 256) {c = '.';}
                REGEX_DUMP_DEBUG_PRINTF(("%c", c));
            }
        }
        break;

    // Operand is an index into this pattern's own vector of sets.
    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            UnicodeString setPattern;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(setPattern, TRUE);
            const int32_t patLen = setPattern.length();
            for (int32_t k = 0; k < patLen; k++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setPattern.charAt(k)));
            }
        }
        break;

    // Operand is an index into fStaticSets, optionally tagged with the
    // URX_NEG_SET bit, which is reported as "NOT " and then stripped.
    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            UnicodeString setPattern;
            if (val & URX_NEG_SET) {
                REGEX_DUMP_DEBUG_PRINTF(("NOT "));
                val &= ~URX_NEG_SET;
            }
            UnicodeSet *set = fStaticSets[val];
            set->toPattern(setPattern, TRUE);
            const int32_t patLen = setPattern.length();
            for (int32_t k = 0; k < patLen; k++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setPattern.charAt(k)));
            }
        }
        break;


    default:
        REGEX_DUMP_DEBUG_PRINTF(("??????"));
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
}
#endif
568
569
#if defined(REGEX_DEBUG)
// Debug helper: prints a description of a compiled pattern.
// Output, in order: the original pattern text, the minimum match length,
// the start-of-match type with its associated data (initial string, initial
// character set, or initial character), and finally one line per compiled
// operation produced by RegexPattern::dumpOp().
U_CAPI void  U_EXPORT2
RegexPatternDump(const RegexPattern *This) {
    int      index;
    int      i;

    REGEX_DUMP_DEBUG_PRINTF(("Original Pattern:  "));
    for (i=0; i<This->fPattern.length(); i++) {
        REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
    REGEX_DUMP_DEBUG_PRINTF(("   Min Match Length:  %d\n", This->fMinMatchLen));
    REGEX_DUMP_DEBUG_PRINTF(("   Match Start Type:  %s\n", START_OF_MATCH_STR(This->fStartType)));
    if (This->fStartType == START_STRING) {
        REGEX_DUMP_DEBUG_PRINTF(("    Initial match string: \""));
        for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
            REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i]));   // TODO:  non-printables, surrogates.
        }
        // Close the quote opened above and finish the line so the following
        // table header starts cleanly.
        REGEX_DUMP_DEBUG_PRINTF(("\"\n"));

    } else if (This->fStartType == START_SET) {
        // Show at most the first 20 characters of the initial-character set.
        int32_t numSetChars = This->fInitialChars->size();
        if (numSetChars > 20) {
            numSetChars = 20;
        }
        REGEX_DUMP_DEBUG_PRINTF(("     Match First Chars : "));
        for (i=0; i<numSetChars; i++) {
            UChar32 c = This->fInitialChars->charAt(i);
            // Characters strictly between 0x20 and 0x7e are printed as-is;
            // everything else is printed in hex.
            if (0x20<c && c <0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
            }
        }
        if (numSetChars < This->fInitialChars->size()) {
            // The list was truncated.
            REGEX_DUMP_DEBUG_PRINTF((" ..."));
        }
        REGEX_DUMP_DEBUG_PRINTF(("\n"));

    } else if (This->fStartType == START_CHAR) {
        REGEX_DUMP_DEBUG_PRINTF(("    First char of Match : "));
        if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
            REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
        } else {
            REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
        }
    }

    REGEX_DUMP_DEBUG_PRINTF(("\nIndex   Binary     Type             Operand\n" \
           "-------------------------------------------\n"));
    for (index = 0; index<This->fCompiledPat->size(); index++) {
        This->dumpOp(index);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
}
#endif
625
626
627
628UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
629
630U_NAMESPACE_END
631#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
632