1/*
2*******************************************************************************
3*   Copyright (C) 2004-2009, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  regex.cpp
7*/
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13#include "unicode/regex.h"
14#include "unicode/uregex.h"
15#include "unicode/unistr.h"
16#include "unicode/ustring.h"
17#include "unicode/uchar.h"
18#include "unicode/uobject.h"
19#include "umutex.h"
20#include "uassert.h"
21#include "cmemory.h"
22
23U_NAMESPACE_USE
24
25struct URegularExpression: public UMemory {
26public:
27    URegularExpression();
28    ~URegularExpression();
29    int32_t           fMagic;
30    RegexPattern     *fPat;
31    int32_t          *fPatRefCount;
32    UChar            *fPatString;
33    int32_t           fPatStringLen;
34    RegexMatcher     *fMatcher;
35    const UChar      *fText;         // Text from setText()
36    int32_t           fTextLength;   // Length provided by user with setText(), which
37                                     //  may be -1.
38
39    UnicodeString     fTextString;   // The setText(text) is wrapped into a UnicodeString.
40                                     // TODO: regexp engine should not depend on UnicodeString.
41};
42
43static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
44
45URegularExpression::URegularExpression() {
46    fMagic        = REXP_MAGIC;
47    fPat          = NULL;
48    fPatRefCount  = NULL;
49    fPatString    = NULL;
50    fPatStringLen = 0;
51    fMatcher      = NULL;
52    fText         = NULL;
53    fTextLength   = 0;
54}
55
56URegularExpression::~URegularExpression() {
57    delete fMatcher;
58    fMatcher = NULL;
59    if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
60        delete fPat;
61        uprv_free(fPatString);
62        uprv_free(fPatRefCount);
63    }
64    fMagic = 0;
65}
66
67//----------------------------------------------------------------------------------------
68//
69//   validateRE    Do boilerplate style checks on API function parameters.
70//                 Return TRUE if they look OK.
71//----------------------------------------------------------------------------------------
72static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
73    if (U_FAILURE(*status)) {
74        return FALSE;
75    }
76    if (re == NULL || re->fMagic != REXP_MAGIC) {
77        *status = U_ILLEGAL_ARGUMENT_ERROR;
78        return FALSE;
79    }
80    if (requiresText && re->fText == NULL) {
81        *status = U_REGEX_INVALID_STATE;
82        return FALSE;
83    }
84    return TRUE;
85}
86
87//----------------------------------------------------------------------------------------
88//
89//    uregex_open
90//
91//----------------------------------------------------------------------------------------
92U_CAPI URegularExpression *  U_EXPORT2
93uregex_open( const  UChar          *pattern,
94                    int32_t         patternLength,
95                    uint32_t        flags,
96                    UParseError    *pe,
97                    UErrorCode     *status) {
98
99    if (U_FAILURE(*status)) {
100        return NULL;
101    }
102    if (pattern == NULL || patternLength < -1 || patternLength == 0) {
103        *status = U_ILLEGAL_ARGUMENT_ERROR;
104        return NULL;
105    }
106    int32_t actualPatLen = patternLength;
107    if (actualPatLen == -1) {
108        actualPatLen = u_strlen(pattern);
109    }
110
111    URegularExpression *re     = new URegularExpression;
112    int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
113    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
114    if (re == NULL || refC == NULL || patBuf == NULL) {
115        *status = U_MEMORY_ALLOCATION_ERROR;
116        delete re;
117        uprv_free(refC);
118        uprv_free(patBuf);
119        return NULL;
120    }
121    re->fPatRefCount = refC;
122    *re->fPatRefCount = 1;
123
124    //
125    // Make a copy of the pattern string, so we can return it later if asked.
126    //    For compiling the pattern, we will use a read-only-aliased UnicodeString
127    //    of this local copy, to avoid making even more copies.
128    //
129    re->fPatString    = patBuf;
130    re->fPatStringLen = patternLength;
131    u_memcpy(patBuf, pattern, actualPatLen);
132    patBuf[actualPatLen] = 0;
133    UnicodeString  patString(patternLength==-1, patBuf, patternLength);
134
135    //
136    // Compile the pattern
137    //
138    if (pe != NULL) {
139        re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
140    } else {
141        re->fPat = RegexPattern::compile(patString, flags, *status);
142    }
143    if (U_FAILURE(*status)) {
144        goto ErrorExit;
145    }
146
147    //
148    // Create the matcher object
149    //
150    re->fMatcher = re->fPat->matcher(*status);
151    if (U_SUCCESS(*status)) {
152        return re;
153    }
154
155ErrorExit:
156    delete re;
157    return NULL;
158
159}
160
161//----------------------------------------------------------------------------------------
162//
163//    uregex_close
164//
165//----------------------------------------------------------------------------------------
166U_CAPI void  U_EXPORT2
167uregex_close(URegularExpression  *re) {
168    UErrorCode  status = U_ZERO_ERROR;
169    if (validateRE(re, &status, FALSE) == FALSE) {
170        return;
171    }
172    delete re;
173}
174
175
176//----------------------------------------------------------------------------------------
177//
178//    uregex_clone
179//
180//----------------------------------------------------------------------------------------
181U_CAPI URegularExpression * U_EXPORT2
182uregex_clone(const URegularExpression *source, UErrorCode *status)  {
183    if (validateRE(source, status, FALSE) == FALSE) {
184        return NULL;
185    }
186
187    URegularExpression *clone = new URegularExpression;
188    if (clone == NULL) {
189        *status = U_MEMORY_ALLOCATION_ERROR;
190        return NULL;
191    }
192
193    clone->fMatcher = source->fPat->matcher(*status);
194    if (U_FAILURE(*status)) {
195        delete clone;
196        return NULL;
197    }
198
199    clone->fPat          = source->fPat;
200    clone->fPatRefCount  = source->fPatRefCount;
201    clone->fPatString    = source->fPatString;
202    clone->fPatStringLen = source->fPatStringLen;
203    umtx_atomic_inc(source->fPatRefCount);
204    // Note:  fText is not cloned.
205
206    return clone;
207}
208
209
210
211
212//------------------------------------------------------------------------------
213//
214//    uregex_pattern
215//
216//------------------------------------------------------------------------------
217U_CAPI const UChar * U_EXPORT2
218uregex_pattern(const  URegularExpression *regexp,
219               int32_t            *patLength,
220               UErrorCode         *status)  {
221
222    if (validateRE(regexp, status, FALSE) == FALSE) {
223        return NULL;
224    }
225    if (patLength != NULL) {
226        *patLength = regexp->fPatStringLen;
227    }
228    return regexp->fPatString;
229}
230
231
232//------------------------------------------------------------------------------
233//
234//    uregex_flags
235//
236//------------------------------------------------------------------------------
237U_CAPI int32_t U_EXPORT2
238uregex_flags(const URegularExpression *regexp, UErrorCode *status)  {
239    if (validateRE(regexp, status, FALSE) == FALSE) {
240        return 0;
241    }
242    int32_t flags = regexp->fPat->flags();
243    return flags;
244}
245
246
247//------------------------------------------------------------------------------
248//
249//    uregex_setText
250//
251//------------------------------------------------------------------------------
252U_CAPI void U_EXPORT2
253uregex_setText(URegularExpression *regexp,
254               const UChar        *text,
255               int32_t             textLength,
256               UErrorCode         *status)  {
257    if (validateRE(regexp, status, FALSE) == FALSE) {
258        return;
259    }
260    if (text == NULL || textLength < -1) {
261        *status = U_ILLEGAL_ARGUMENT_ERROR;
262        return;
263    }
264    regexp->fText       = text;
265    regexp->fTextLength = textLength;
266    UBool isTerminated  = (textLength == -1);
267
268    regexp->fTextString.setTo(isTerminated, text, textLength);
269    regexp->fMatcher->reset(regexp->fTextString);
270}
271
272
273
274//------------------------------------------------------------------------------
275//
276//    uregex_getText
277//
278//------------------------------------------------------------------------------
279U_CAPI const UChar * U_EXPORT2
280uregex_getText(URegularExpression *regexp,
281               int32_t            *textLength,
282               UErrorCode         *status)  {
283    if (validateRE(regexp, status, FALSE) == FALSE) {
284        return NULL;
285    }
286    if (textLength != NULL) {
287        *textLength = regexp->fTextLength;
288    }
289    return regexp->fText;
290}
291
292
293//------------------------------------------------------------------------------
294//
295//    uregex_matches
296//
297//------------------------------------------------------------------------------
298U_CAPI UBool U_EXPORT2
299uregex_matches(URegularExpression *regexp,
300                int32_t            startIndex,
301                UErrorCode        *status)  {
302    UBool result = FALSE;
303    if (validateRE(regexp, status) == FALSE) {
304        return result;
305    }
306    if (startIndex == -1) {
307        result = regexp->fMatcher->matches(*status);
308    } else {
309        result = regexp->fMatcher->matches(startIndex, *status);
310    }
311    return result;
312}
313
314
315
316//------------------------------------------------------------------------------
317//
318//    uregex_lookingAt
319//
320//------------------------------------------------------------------------------
321U_CAPI UBool U_EXPORT2
322uregex_lookingAt(URegularExpression *regexp,
323                 int32_t             startIndex,
324                 UErrorCode         *status)  {
325    UBool result = FALSE;
326    if (validateRE(regexp, status) == FALSE) {
327        return result;
328    }
329    if (startIndex == -1) {
330        result = regexp->fMatcher->lookingAt(*status);
331    } else {
332        result = regexp->fMatcher->lookingAt(startIndex, *status);
333    }
334    return result;
335}
336
337
338
339//------------------------------------------------------------------------------
340//
341//    uregex_find
342//
343//------------------------------------------------------------------------------
344U_CAPI UBool U_EXPORT2
345uregex_find(URegularExpression *regexp,
346            int32_t             startIndex,
347            UErrorCode         *status)  {
348    UBool result = FALSE;
349    if (validateRE(regexp, status) == FALSE) {
350        return result;
351    }
352    if (startIndex == -1) {
353        regexp->fMatcher->resetPreserveRegion();
354        result = regexp->fMatcher->find();
355    } else {
356        result = regexp->fMatcher->find(startIndex, *status);
357    }
358    return result;
359}
360
361//------------------------------------------------------------------------------
362//
363//    uregex_findNext
364//
365//------------------------------------------------------------------------------
366U_CAPI UBool U_EXPORT2
367uregex_findNext(URegularExpression *regexp,
368                UErrorCode         *status)  {
369    if (validateRE(regexp, status) == FALSE) {
370        return FALSE;
371    }
372    UBool result = regexp->fMatcher->find();
373    return result;
374}
375
376//------------------------------------------------------------------------------
377//
378//    uregex_groupCount
379//
380//------------------------------------------------------------------------------
381U_CAPI int32_t U_EXPORT2
382uregex_groupCount(URegularExpression *regexp,
383                  UErrorCode         *status)  {
384    if (validateRE(regexp, status, FALSE) == FALSE) {
385        return 0;
386    }
387    int32_t  result = regexp->fMatcher->groupCount();
388    return result;
389}
390
391
392//------------------------------------------------------------------------------
393//
394//    uregex_group
395//
396//------------------------------------------------------------------------------
397U_CAPI int32_t U_EXPORT2
398uregex_group(URegularExpression *regexp,
399             int32_t             groupNum,
400             UChar              *dest,
401             int32_t             destCapacity,
402             UErrorCode          *status)  {
403    if (validateRE(regexp, status) == FALSE) {
404        return 0;
405    }
406    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
407        *status = U_ILLEGAL_ARGUMENT_ERROR;
408        return 0;
409    }
410
411    //
412    // Pick up the range of characters from the matcher
413    //
414    int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
415    int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
416    if (U_FAILURE(*status)) {
417        return 0;
418    }
419
420    //
421    // Trim length based on buffer capacity
422    //
423    int32_t fullLength = endIx - startIx;
424    int32_t copyLength = fullLength;
425    if (copyLength < destCapacity) {
426        dest[copyLength] = 0;
427    } else  if (copyLength == destCapacity) {
428        *status = U_STRING_NOT_TERMINATED_WARNING;
429    } else {
430        copyLength = destCapacity;
431        *status = U_BUFFER_OVERFLOW_ERROR;
432    }
433
434    //
435    // Copy capture group to user's buffer
436    //
437    if (copyLength > 0) {
438        u_memcpy(dest, &regexp->fText[startIx], copyLength);
439    }
440    return fullLength;
441}
442
443
444//------------------------------------------------------------------------------
445//
446//    uregex_start
447//
448//------------------------------------------------------------------------------
449U_CAPI int32_t U_EXPORT2
450uregex_start(URegularExpression *regexp,
451             int32_t             groupNum,
452             UErrorCode          *status)  {
453    if (validateRE(regexp, status) == FALSE) {
454        return 0;
455    }
456    int32_t result = regexp->fMatcher->start(groupNum, *status);
457    return result;
458}
459
460
461//------------------------------------------------------------------------------
462//
463//    uregex_end
464//
465//------------------------------------------------------------------------------
466U_CAPI int32_t U_EXPORT2
467uregex_end(URegularExpression   *regexp,
468           int32_t               groupNum,
469           UErrorCode           *status)  {
470    if (validateRE(regexp, status) == FALSE) {
471        return 0;
472    }
473    int32_t result = regexp->fMatcher->end(groupNum, *status);
474    return result;
475}
476
477//------------------------------------------------------------------------------
478//
479//    uregex_reset
480//
481//------------------------------------------------------------------------------
482U_CAPI void U_EXPORT2
483uregex_reset(URegularExpression    *regexp,
484             int32_t               index,
485             UErrorCode            *status)  {
486    if (validateRE(regexp, status) == FALSE) {
487        return;
488    }
489    regexp->fMatcher->reset(index, *status);
490}
491
492
493//------------------------------------------------------------------------------
494//
495//    uregex_setRegion
496//
497//------------------------------------------------------------------------------
498U_CAPI void U_EXPORT2
499uregex_setRegion(URegularExpression   *regexp,
500                 int32_t               regionStart,
501                 int32_t               regionLimit,
502                 UErrorCode           *status)  {
503    if (validateRE(regexp, status) == FALSE) {
504        return;
505    }
506    regexp->fMatcher->region(regionStart, regionLimit, *status);
507}
508
509
510//------------------------------------------------------------------------------
511//
512//    uregex_regionStart
513//
514//------------------------------------------------------------------------------
515U_CAPI int32_t U_EXPORT2
516uregex_regionStart(const  URegularExpression   *regexp,
517                          UErrorCode           *status)  {
518    if (validateRE(regexp, status) == FALSE) {
519        return 0;
520    }
521    return regexp->fMatcher->regionStart();
522}
523
524
525//------------------------------------------------------------------------------
526//
527//    uregex_regionEnd
528//
529//------------------------------------------------------------------------------
530U_CAPI int32_t U_EXPORT2
531uregex_regionEnd(const  URegularExpression   *regexp,
532                        UErrorCode           *status)  {
533    if (validateRE(regexp, status) == FALSE) {
534        return 0;
535    }
536    return regexp->fMatcher->regionEnd();
537}
538
539
540//------------------------------------------------------------------------------
541//
542//    uregex_hasTransparentBounds
543//
544//------------------------------------------------------------------------------
545U_CAPI UBool U_EXPORT2
546uregex_hasTransparentBounds(const  URegularExpression   *regexp,
547                                   UErrorCode           *status)  {
548    if (validateRE(regexp, status) == FALSE) {
549        return FALSE;
550    }
551    return regexp->fMatcher->hasTransparentBounds();
552}
553
554
555//------------------------------------------------------------------------------
556//
557//    uregex_useTransparentBounds
558//
559//------------------------------------------------------------------------------
560U_CAPI void U_EXPORT2
561uregex_useTransparentBounds(URegularExpression    *regexp,
562             UBool                 b,
563             UErrorCode            *status)  {
564    if (validateRE(regexp, status) == FALSE) {
565        return;
566    }
567    regexp->fMatcher->useTransparentBounds(b);
568}
569
570
571//------------------------------------------------------------------------------
572//
573//    uregex_hasAnchoringBounds
574//
575//------------------------------------------------------------------------------
576U_CAPI UBool U_EXPORT2
577uregex_hasAnchoringBounds(const  URegularExpression   *regexp,
578                                   UErrorCode           *status)  {
579    if (validateRE(regexp, status) == FALSE) {
580        return FALSE;
581    }
582    return regexp->fMatcher->hasAnchoringBounds();
583}
584
585
586//------------------------------------------------------------------------------
587//
588//    uregex_useAnchoringBounds
589//
590//------------------------------------------------------------------------------
591U_CAPI void U_EXPORT2
592uregex_useAnchoringBounds(URegularExpression    *regexp,
593             UBool                 b,
594             UErrorCode            *status)  {
595    if (validateRE(regexp, status) == FALSE) {
596        return;
597    }
598    regexp->fMatcher->useAnchoringBounds(b);
599}
600
601
602//------------------------------------------------------------------------------
603//
604//    uregex_hitEnd
605//
606//------------------------------------------------------------------------------
607U_CAPI UBool U_EXPORT2
608uregex_hitEnd(const  URegularExpression   *regexp,
609                     UErrorCode           *status)  {
610    if (validateRE(regexp, status) == FALSE) {
611        return FALSE;
612    }
613    return regexp->fMatcher->hitEnd();
614}
615
616
617//------------------------------------------------------------------------------
618//
619//    uregex_requireEnd
620//
621//------------------------------------------------------------------------------
622U_CAPI UBool U_EXPORT2
623uregex_requireEnd(const  URegularExpression   *regexp,
624                         UErrorCode           *status)  {
625    if (validateRE(regexp, status) == FALSE) {
626        return FALSE;
627    }
628    return regexp->fMatcher->requireEnd();
629}
630
631
632//------------------------------------------------------------------------------
633//
634//    uregex_setTimeLimit
635//
636//------------------------------------------------------------------------------
637U_CAPI void U_EXPORT2
638uregex_setTimeLimit(URegularExpression   *regexp,
639                    int32_t               limit,
640                    UErrorCode           *status) {
641    if (validateRE(regexp, status)) {
642        regexp->fMatcher->setTimeLimit(limit, *status);
643    }
644}
645
646
647
648//------------------------------------------------------------------------------
649//
650//    uregex_getTimeLimit
651//
652//------------------------------------------------------------------------------
653U_CAPI int32_t U_EXPORT2
654uregex_getTimeLimit(const  URegularExpression   *regexp,
655                           UErrorCode           *status) {
656    int32_t retVal = 0;
657    if (validateRE(regexp, status)) {
658        retVal = regexp->fMatcher->getTimeLimit();
659    }
660    return retVal;
661}
662
663
664
665//------------------------------------------------------------------------------
666//
667//    uregex_setStackLimit
668//
669//------------------------------------------------------------------------------
670U_CAPI void U_EXPORT2
671uregex_setStackLimit(URegularExpression   *regexp,
672                    int32_t               limit,
673                    UErrorCode           *status) {
674    if (validateRE(regexp, status)) {
675        regexp->fMatcher->setStackLimit(limit, *status);
676    }
677}
678
679
680
681//------------------------------------------------------------------------------
682//
683//    uregex_getStackLimit
684//
685//------------------------------------------------------------------------------
686U_CAPI int32_t U_EXPORT2
687uregex_getStackLimit(const  URegularExpression   *regexp,
688                           UErrorCode           *status) {
689    int32_t retVal = 0;
690    if (validateRE(regexp, status)) {
691        retVal = regexp->fMatcher->getStackLimit();
692    }
693    return retVal;
694}
695
696
697//------------------------------------------------------------------------------
698//
699//    uregex_setMatchCallback
700//
701//------------------------------------------------------------------------------
702U_CAPI void U_EXPORT2
703uregex_setMatchCallback(URegularExpression      *regexp,
704                        URegexMatchCallback     *callback,
705                        const void              *context,
706                        UErrorCode              *status) {
707    if (validateRE(regexp, status)) {
708      regexp->fMatcher->setMatchCallback(callback, context, *status);
709    }
710}
711
712
713//------------------------------------------------------------------------------
714//
715//    uregex_getMatchCallback
716//
717//------------------------------------------------------------------------------
718U_CAPI void U_EXPORT2
719uregex_getMatchCallback(const URegularExpression    *regexp,
720                        URegexMatchCallback        **callback,
721                        const void                 **context,
722                        UErrorCode                  *status) {
723     if (validateRE(regexp, status)) {
724         regexp->fMatcher->getMatchCallback(*callback, *context, *status);
725     }
726}
727
728
729//------------------------------------------------------------------------------
730//
731//    uregex_replaceAll
732//
733//------------------------------------------------------------------------------
734U_CAPI int32_t U_EXPORT2
735uregex_replaceAll(URegularExpression    *regexp,
736                  const UChar           *replacementText,
737                  int32_t                replacementLength,
738                  UChar                 *destBuf,
739                  int32_t                destCapacity,
740                  UErrorCode            *status)  {
741    if (validateRE(regexp, status) == FALSE) {
742        return 0;
743    }
744    if (replacementText == NULL || replacementLength < -1 ||
745        destBuf == NULL && destCapacity > 0 ||
746        destCapacity < 0) {
747        *status = U_ILLEGAL_ARGUMENT_ERROR;
748        return 0;
749    }
750
751    int32_t   len = 0;
752
753    uregex_reset(regexp, 0, status);
754
755    // Note: Seperate error code variables for findNext() and appendReplacement()
756    //       are used so that destination buffer overflow errors
757    //       in appendReplacement won't stop findNext() from working.
758    //       appendReplacement() and appendTail() special case incoming buffer
759    //       overflow errors, continuing to return the correct length.
760    UErrorCode  findStatus = *status;
761    while (uregex_findNext(regexp, &findStatus)) {
762        len += uregex_appendReplacement(regexp, replacementText, replacementLength,
763                                        &destBuf, &destCapacity, status);
764    }
765    len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
766
767    if (U_FAILURE(findStatus)) {
768        // If anything went wrong with the findNext(), make that error trump
769        //   whatever may have happened with the append() operations.
770        //   Errors in findNext() are not expected.
771        *status = findStatus;
772    }
773
774    return len;
775}
776
777
778//------------------------------------------------------------------------------
779//
780//    uregex_replaceFirst
781//
782//------------------------------------------------------------------------------
783U_CAPI int32_t U_EXPORT2
784uregex_replaceFirst(URegularExpression  *regexp,
785                    const UChar         *replacementText,
786                    int32_t              replacementLength,
787                    UChar               *destBuf,
788                    int32_t              destCapacity,
789                    UErrorCode          *status)  {
790    if (validateRE(regexp, status) == FALSE) {
791        return 0;
792    }
793    if (replacementText == NULL || replacementLength < -1 ||
794        destBuf == NULL && destCapacity > 0 ||
795        destCapacity < 0) {
796        *status = U_ILLEGAL_ARGUMENT_ERROR;
797        return 0;
798    }
799
800    int32_t   len = 0;
801    UBool     findSucceeded;
802    uregex_reset(regexp, 0, status);
803    findSucceeded = uregex_find(regexp, 0, status);
804    if (findSucceeded) {
805        len = uregex_appendReplacement(regexp, replacementText, replacementLength,
806                                       &destBuf, &destCapacity, status);
807    }
808    len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
809
810    return len;
811}
812
813
814//------------------------------------------------------------------------------
815//
816//    uregex_appendReplacement
817//
818//------------------------------------------------------------------------------
819
820
821//
822//  Dummy class, because these functions need to be friends of class RegexMatcher,
823//               and stand-alone C functions don't work as friends
824//
825U_NAMESPACE_BEGIN
826class RegexCImpl {
827 public:
828   inline static  int32_t appendReplacement(URegularExpression    *regexp,
829                      const UChar           *replacementText,
830                      int32_t                replacementLength,
831                      UChar                **destBuf,
832                      int32_t               *destCapacity,
833                      UErrorCode            *status);
834
835   inline static int32_t appendTail(URegularExpression    *regexp,
836                  UChar                **destBuf,
837                  int32_t               *destCapacity,
838                  UErrorCode            *status);
839};
840U_NAMESPACE_END
841
842
843//
//  Call-back function for u_unescapeAt(), used when we encounter
//    \uxxxx or \Uxxxxxxxx escapes in the replacement text.
846//
847U_CDECL_BEGIN
848static UChar U_CALLCONV
849unescape_charAt(int32_t offset, void *context) {
850    UChar c16 = ((UChar *)context)[offset];
851    return c16;
852}
853U_CDECL_END
854
855
856static const UChar BACKSLASH  = 0x5c;
857static const UChar DOLLARSIGN = 0x24;
858
859//
860//  Move a character to an output buffer, with bounds checking on the index.
861//      Index advances even if capacity is exceeded, for preflight size computations.
862//      This little sequence is used a LOT.
863//
864static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
865    if (*idx < bufCapacity) {
866        buf[*idx] = c;
867    }
868    (*idx)++;
869}
870
871
872//
873//  appendReplacement, the actual implementation.
874//
875int32_t RegexCImpl::appendReplacement(URegularExpression    *regexp,
876                  const UChar           *replacementText,
877                  int32_t                replacementLength,
878                  UChar                **destBuf,
879                  int32_t               *destCapacity,
880                  UErrorCode            *status)  {
881
882    // If we come in with a buffer overflow error, don't suppress the operation.
883    //  A series of appendReplacements, appendTail need to correctly preflight
884    //  the buffer size when an overflow happens somewhere in the middle.
885    UBool pendingBufferOverflow = FALSE;
886    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
887        pendingBufferOverflow = TRUE;
888        *status = U_ZERO_ERROR;
889    }
890
891    //
    // Validate all parameters
893    //
894    if (validateRE(regexp, status) == FALSE) {
895        return 0;
896    }
897    if (replacementText == NULL || replacementLength < -1 ||
898        destCapacity == NULL || destBuf == NULL ||
899        *destBuf == NULL && *destCapacity > 0 ||
900        *destCapacity < 0) {
901        *status = U_ILLEGAL_ARGUMENT_ERROR;
902        return 0;
903    }
904
905    RegexMatcher *m = regexp->fMatcher;
906    if (m->fMatch == FALSE) {
907        *status = U_REGEX_INVALID_STATE;
908        return 0;
909    }
910
911    UChar    *dest             = *destBuf;
912    int32_t   capacity         = *destCapacity;
913    int32_t   destIdx          =  0;
914    int32_t   i;
915
916    // If it wasn't supplied by the caller,  get the length of the replacement text.
917    //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
918    //          the fly and avoid this step.
919    if (replacementLength == -1) {
920        replacementLength = u_strlen(replacementText);
921    }
922
923    // Copy input string from the end of previous match to start of current match
924    for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
925        appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
926    }
927
928
929
930    // scan the replacement text, looking for substitutions ($n) and \escapes.
931    int32_t  replIdx = 0;
932    while (replIdx < replacementLength) {
933        UChar  c = replacementText[replIdx];
934        replIdx++;
935        if (c != DOLLARSIGN && c != BACKSLASH) {
936            // Common case, no substitution, no escaping,
937            //  just copy the char to the dest buf.
938            appendToBuf(c, &destIdx, dest, capacity);
939            continue;
940        }
941
942        if (c == BACKSLASH) {
943            // Backslash Escape.  Copy the following char out without further checks.
944            //                    Note:  Surrogate pairs don't need any special handling
945            //                           The second half wont be a '$' or a '\', and
946            //                           will move to the dest normally on the next
947            //                           loop iteration.
948            if (replIdx >= replacementLength) {
949                break;
950            }
951            c = replacementText[replIdx];
952
953            if (c==0x55/*U*/ || c==0x75/*u*/) {
954                // We have a \udddd or \Udddddddd escape sequence.
955                UChar32 escapedChar =
956                    u_unescapeAt(unescape_charAt,
957                       &replIdx,                   // Index is updated by unescapeAt
958                       replacementLength,          // Length of replacement text
959                       (void *)replacementText);
960
961                if (escapedChar != (UChar32)0xFFFFFFFF) {
962                    if (escapedChar <= 0xffff) {
963                        appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
964                    } else {
965                        appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
966                        appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
967                    }
968                    continue;
969                }
970                // Note:  if the \u escape was invalid, just fall through and
971                //        treat it as a plain \<anything> escape.
972            }
973
974            // Plain backslash escape.  Just put out the escaped character.
975            appendToBuf(c, &destIdx, dest, capacity);
976
977            replIdx++;
978            continue;
979        }
980
981
982
983        // We've got a $.  Pick up a capture group number if one follows.
984        // Consume at most the number of digits necessary for the largest capture
985        // number that is valid for this pattern.
986
987        int32_t numDigits = 0;
988        int32_t groupNum  = 0;
989        UChar32 digitC;
990        for (;;) {
991            if (replIdx >= replacementLength) {
992                break;
993            }
994            U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
995            if (u_isdigit(digitC) == FALSE) {
996                break;
997            }
998
999            U16_FWD_1(replacementText, replIdx, replacementLength);
1000            groupNum=groupNum*10 + u_charDigitValue(digitC);
1001            numDigits++;
1002            if (numDigits >= m->fPattern->fMaxCaptureDigits) {
1003                break;
1004            }
1005        }
1006
1007
1008        if (numDigits == 0) {
1009            // The $ didn't introduce a group number at all.
1010            // Treat it as just part of the substitution text.
1011            appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
1012            continue;
1013        }
1014
1015        // Finally, append the capture group data to the destination.
1016        int32_t  capacityRemaining = capacity - destIdx;
1017        if (capacityRemaining < 0) {
1018            capacityRemaining = 0;
1019        }
1020        destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
1021        if (*status == U_BUFFER_OVERFLOW_ERROR) {
1022            // Ignore buffer overflow when extracting the group.  We need to
1023            //   continue on to get full size of the untruncated result.  We will
1024            //   raise our own buffer overflow error at the end.
1025            *status = U_ZERO_ERROR;
1026        }
1027
1028        if (U_FAILURE(*status)) {
1029            // Can fail if group number is out of range.
1030            break;
1031        }
1032
1033    }
1034
1035    //
1036    //  Nul Terminate the dest buffer if possible.
1037    //  Set the appropriate buffer overflow or not terminated error, if needed.
1038    //
1039    if (destIdx < capacity) {
1040        dest[destIdx] = 0;
1041    } else if (destIdx == *destCapacity) {
1042        *status = U_STRING_NOT_TERMINATED_WARNING;
1043    } else {
1044        *status = U_BUFFER_OVERFLOW_ERROR;
1045    }
1046
1047    //
1048    // Return an updated dest buffer and capacity to the caller.
1049    //
1050    if (destIdx > 0 &&  *destCapacity > 0) {
1051        if (destIdx < capacity) {
1052            *destBuf      += destIdx;
1053            *destCapacity -= destIdx;
1054        } else {
1055            *destBuf      += capacity;
1056            *destCapacity =  0;
1057        }
1058    }
1059
1060    // If we came in with a buffer overflow, make sure we go out with one also.
1061    //   (A zero length match right at the end of the previous match could
1062    //    make this function succeed even though a previous call had overflowed the buf)
1063    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1064        *status = U_BUFFER_OVERFLOW_ERROR;
1065    }
1066
1067    return destIdx;
1068}
1069
1070//
//   appendReplacement   the actual API function,
1072//
1073U_CAPI int32_t U_EXPORT2
1074uregex_appendReplacement(URegularExpression    *regexp,
1075                  const UChar           *replacementText,
1076                  int32_t                replacementLength,
1077                  UChar                **destBuf,
1078                  int32_t               *destCapacity,
1079                  UErrorCode            *status) {
1080    return RegexCImpl::appendReplacement(
1081        regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1082}
1083
1084
1085//------------------------------------------------------------------------------
1086//
1087//    uregex_appendTail
1088//
1089//------------------------------------------------------------------------------
1090int32_t RegexCImpl::appendTail(URegularExpression    *regexp,
1091                  UChar                **destBuf,
1092                  int32_t               *destCapacity,
1093                  UErrorCode            *status)
1094{
1095
1096    // If we come in with a buffer overflow error, don't suppress the operation.
1097    //  A series of appendReplacements, appendTail need to correctly preflight
1098    //  the buffer size when an overflow happens somewhere in the middle.
1099    UBool pendingBufferOverflow = FALSE;
1100    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1101        pendingBufferOverflow = TRUE;
1102        *status = U_ZERO_ERROR;
1103    }
1104
1105    if (validateRE(regexp, status) == FALSE) {
1106        return 0;
1107    }
1108
1109    if (destCapacity == NULL || destBuf == NULL ||
1110        *destBuf == NULL && *destCapacity > 0 ||
1111        *destCapacity < 0)
1112    {
1113        *status = U_ILLEGAL_ARGUMENT_ERROR;
1114        return 0;
1115    }
1116
1117    RegexMatcher *m = regexp->fMatcher;
1118
1119    int32_t  srcIdx;
1120    if (m->fMatch) {
1121        // The most recent call to find() succeeded.
1122        srcIdx = m->fMatchEnd;
1123    } else {
1124        // The last call to find() on this matcher failed().
1125        //   Look back to the end of the last find() that succeeded for src index.
1126        srcIdx = m->fLastMatchEnd;
1127        if (srcIdx == -1)  {
1128            // There has been no successful match with this matcher.
1129            //   We want to copy the whole string.
1130            srcIdx = 0;
1131        }
1132    }
1133
1134    int32_t  destIdx     = 0;
1135    int32_t  destCap     = *destCapacity;
1136    UChar    *dest       = *destBuf;
1137
1138    for (;;) {
1139        if (srcIdx == regexp->fTextLength) {
1140            break;
1141        }
1142        UChar c = regexp->fText[srcIdx];
1143        if (c == 0 && regexp->fTextLength == -1) {
1144            break;
1145        }
1146        if (destIdx < destCap) {
1147            dest[destIdx] = c;
1148        } else {
1149            // We've overflowed the dest buffer.
1150            //  If the total input string length is known, we can
1151            //    compute the total buffer size needed without scanning through the string.
1152            if (regexp->fTextLength > 0) {
1153                destIdx += (regexp->fTextLength - srcIdx);
1154                break;
1155            }
1156        }
1157        srcIdx++;
1158        destIdx++;
1159    }
1160
1161    //
1162    //  NUL terminate the output string, if possible, otherwise issue the
1163    //   appropriate error or warning.
1164    //
1165    if (destIdx < destCap) {
1166        dest[destIdx] = 0;
1167    } else  if (destIdx == destCap) {
1168        *status = U_STRING_NOT_TERMINATED_WARNING;
1169    } else {
1170        *status = U_BUFFER_OVERFLOW_ERROR;
1171    }
1172
1173    //
1174    // Update the user's buffer ptr and capacity vars to reflect the
1175    //   amount used.
1176    //
1177    if (destIdx < destCap) {
1178        *destBuf      += destIdx;
1179        *destCapacity -= destIdx;
1180    } else {
1181        *destBuf      += destCap;
1182        *destCapacity  = 0;
1183    }
1184
1185    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1186        *status = U_BUFFER_OVERFLOW_ERROR;
1187    }
1188
1189    return destIdx;
1190}
1191
1192
1193U_CAPI int32_t U_EXPORT2
1194uregex_appendTail(URegularExpression    *regexp,
1195                  UChar                **destBuf,
1196                  int32_t               *destCapacity,
1197                  UErrorCode            *status)  {
1198    return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1199}
1200
1201
1202//------------------------------------------------------------------------------
1203//
1204//    copyString     Internal utility to copy a string to an output buffer,
1205//                   while managing buffer overflow and preflight size
1206//                   computation.  NUL termination is added to destination,
1207//                   and the NUL is counted in the output size.
1208//
1209//------------------------------------------------------------------------------
1210static void copyString(UChar        *destBuffer,    //  Destination buffer.
1211                       int32_t       destCapacity,  //  Total capacity of dest buffer
1212                       int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
1213                                                    //    Update not clipped to destCapacity.
1214                       const UChar  *srcPtr,        //  Pointer to source string
1215                       int32_t       srcLen)        //  Source string len.
1216{
1217    int32_t  si;
1218    int32_t  di = *destIndex;
1219    UChar    c;
1220
1221    for (si=0; si<srcLen;  si++) {
1222        c = srcPtr[si];
1223        if (di < destCapacity) {
1224            destBuffer[di] = c;
1225            di++;
1226        } else {
1227            di += srcLen - si;
1228            break;
1229        }
1230    }
1231    if (di<destCapacity) {
1232        destBuffer[di] = 0;
1233    }
1234    di++;
1235    *destIndex = di;
1236}
1237
1238
1239//------------------------------------------------------------------------------
1240//
1241//    uregex_split
1242//
1243//------------------------------------------------------------------------------
1244U_CAPI int32_t U_EXPORT2
1245uregex_split(   URegularExpression      *regexp,
1246                  UChar                 *destBuf,
1247                  int32_t                destCapacity,
1248                  int32_t               *requiredCapacity,
1249                  UChar                 *destFields[],
1250                  int32_t                destFieldsCapacity,
1251                  UErrorCode            *status) {
1252    if (validateRE(regexp, status) == FALSE) {
1253        return 0;
1254    }
1255    if (destBuf == NULL && destCapacity > 0 ||
1256        destCapacity < 0 ||
1257        destFields == NULL ||
1258        destFieldsCapacity < 1 ) {
1259        *status = U_ILLEGAL_ARGUMENT_ERROR;
1260        return 0;
1261    }
1262
1263    //
1264    // Reset for the input text
1265    //
1266    regexp->fMatcher->reset();
1267    int32_t   inputLen = regexp->fTextString.length();
1268    int32_t   nextOutputStringStart = 0;
1269    if (inputLen == 0) {
1270        return 0;
1271    }
1272
1273
1274    //
1275    // Loop through the input text, searching for the delimiter pattern
1276    //
1277    int32_t   i;             // Index of the field being processed.
1278    int32_t   destIdx = 0;   // Next available position in destBuf;
1279    int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
1280    for (i=0; ; i++) {
1281        if (i>=destFieldsCapacity-1) {
1282            // There are one or zero output string left.
1283            // Fill the last output string with whatever is left from the input, then exit the loop.
1284            //  ( i will be == destFieldsCapacity if we filled the output array while processing
1285            //    capture groups of the delimiter expression, in which case we will discard the
1286            //    last capture group saved in favor of the unprocessed remainder of the
1287            //    input string.)
1288            int32_t remainingLength = inputLen-nextOutputStringStart;
1289            if (remainingLength > 0) {
1290            }
1291            if (i >= destFieldsCapacity) {
1292                // No fields are left.  Recycle the last one for holding the trailing part of
1293                //   the input string.
1294                i = destFieldsCapacity-1;
1295                destIdx = (int32_t)(destFields[i] - destFields[0]);
1296            }
1297
1298            destFields[i] = &destBuf[destIdx];
1299            copyString(destBuf, destCapacity, &destIdx,
1300                &regexp->fText[nextOutputStringStart], remainingLength);
1301            break;
1302        }
1303
1304        if (regexp->fMatcher->find()) {
1305            // We found another delimiter.  Move everything from where we started looking
1306            //  up until the start of the delimiter into the next output string.
1307            int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
1308            destFields[i] = &destBuf[destIdx];
1309            copyString(destBuf, destCapacity, &destIdx,
1310                &regexp->fText[nextOutputStringStart], fieldLen);
1311            nextOutputStringStart =  regexp->fMatcher->end(*status);
1312
1313            // If the delimiter pattern has capturing parentheses, the captured
1314            //  text goes out into the next n destination strings.
1315            int32_t groupNum;
1316            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1317                // If we've run out of output string slots, bail out.
1318                if (i==destFieldsCapacity-1) {
1319                    break;
1320                }
1321                i++;
1322
1323                // Set up to extract the capture group contents into the dest buffer.
1324                UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow
1325                                                      //  error while extracting this group.
1326                int32_t remainingCapacity = destCapacity - destIdx;
1327                if (remainingCapacity < 0) {
1328                    remainingCapacity = 0;
1329                }
1330                destFields[i] = &destBuf[destIdx];
1331                int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
1332                destIdx += t + 1;    // Record the space used in the output string buffer.
1333                                     //  +1 for the NUL that terminates the string.
1334            }
1335
1336            if (nextOutputStringStart == inputLen) {
1337                // The delimiter was at the end of the string.  We're done.
1338                break;
1339            }
1340
1341        }
1342        else
1343        {
1344            // We ran off the end of the input while looking for the next delimiter.
1345            // All the remaining text goes into the current output string.
1346            destFields[i] = &destBuf[destIdx];
1347            copyString(destBuf, destCapacity, &destIdx,
1348                         &regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
1349            break;
1350        }
1351    }
1352
1353    // Zero out any unused portion of the destFields array
1354    int j;
1355    for (j=i+1; j<destFieldsCapacity; j++) {
1356        destFields[j] = NULL;
1357    }
1358
1359    if (requiredCapacity != NULL) {
1360        *requiredCapacity = destIdx;
1361    }
1362    if (destIdx > destCapacity) {
1363        *status = U_BUFFER_OVERFLOW_ERROR;
1364    }
1365    return i+1;
1366}
1367
1368
1369#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1370
1371