1/*
2*******************************************************************************
3*   Copyright (C) 2004-2013, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  uregex.cpp
7*/
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13#include "unicode/regex.h"
14#include "unicode/uregex.h"
15#include "unicode/unistr.h"
16#include "unicode/ustring.h"
17#include "unicode/uchar.h"
18#include "unicode/uobject.h"
19#include "unicode/utf16.h"
20#include "umutex.h"
21#include "uassert.h"
22#include "cmemory.h"
23
24#include "regextxt.h"
25
26#include <stdio.h>
27
28U_NAMESPACE_BEGIN
29
// Yields (len - idx) when that difference is positive, otherwise 0.
#define REMAINING_CAPACITY(idx,len) ((0<((len)-(idx)))?((len)-(idx)):0)
31
32struct RegularExpression: public UMemory {
33public:
34    RegularExpression();
35    ~RegularExpression();
36    int32_t           fMagic;
37    RegexPattern     *fPat;
38    u_atomic_int32_t *fPatRefCount;
39    UChar            *fPatString;
40    int32_t           fPatStringLen;
41    RegexMatcher     *fMatcher;
42    const UChar      *fText;         // Text from setText()
43    int32_t           fTextLength;   // Length provided by user with setText(), which
44                                     //  may be -1.
45    UBool             fOwnsText;
46};
47
48static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
49
50RegularExpression::RegularExpression() {
51    fMagic        = REXP_MAGIC;
52    fPat          = NULL;
53    fPatRefCount  = NULL;
54    fPatString    = NULL;
55    fPatStringLen = 0;
56    fMatcher      = NULL;
57    fText         = NULL;
58    fTextLength   = 0;
59    fOwnsText     = FALSE;
60}
61
62RegularExpression::~RegularExpression() {
63    delete fMatcher;
64    fMatcher = NULL;
65    if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
66        delete fPat;
67        uprv_free(fPatString);
68        uprv_free((void *)fPatRefCount);
69    }
70    if (fOwnsText && fText!=NULL) {
71        uprv_free((void *)fText);
72    }
73    fMagic = 0;
74}
75
76U_NAMESPACE_END
77
78U_NAMESPACE_USE
79
80//----------------------------------------------------------------------------------------
81//
82//   validateRE    Do boilerplate style checks on API function parameters.
83//                 Return TRUE if they look OK.
84//----------------------------------------------------------------------------------------
85static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
86    if (U_FAILURE(*status)) {
87        return FALSE;
88    }
89    if (re == NULL || re->fMagic != REXP_MAGIC) {
90        *status = U_ILLEGAL_ARGUMENT_ERROR;
91        return FALSE;
92    }
93    // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
94    if (requiresText && re->fText == NULL && !re->fOwnsText) {
95        *status = U_REGEX_INVALID_STATE;
96        return FALSE;
97    }
98    return TRUE;
99}
100
101//----------------------------------------------------------------------------------------
102//
103//    uregex_open
104//
105//----------------------------------------------------------------------------------------
106U_CAPI URegularExpression *  U_EXPORT2
107uregex_open( const  UChar          *pattern,
108                    int32_t         patternLength,
109                    uint32_t        flags,
110                    UParseError    *pe,
111                    UErrorCode     *status) {
112
113    if (U_FAILURE(*status)) {
114        return NULL;
115    }
116    if (pattern == NULL || patternLength < -1 || patternLength == 0) {
117        *status = U_ILLEGAL_ARGUMENT_ERROR;
118        return NULL;
119    }
120    int32_t actualPatLen = patternLength;
121    if (actualPatLen == -1) {
122        actualPatLen = u_strlen(pattern);
123    }
124
125    RegularExpression  *re     = new RegularExpression;
126    u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
127    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
128    if (re == NULL || refC == NULL || patBuf == NULL) {
129        *status = U_MEMORY_ALLOCATION_ERROR;
130        delete re;
131        uprv_free((void *)refC);
132        uprv_free(patBuf);
133        return NULL;
134    }
135    re->fPatRefCount = refC;
136    *re->fPatRefCount = 1;
137
138    //
139    // Make a copy of the pattern string, so we can return it later if asked.
140    //    For compiling the pattern, we will use a UText wrapper around
141    //    this local copy, to avoid making even more copies.
142    //
143    re->fPatString    = patBuf;
144    re->fPatStringLen = patternLength;
145    u_memcpy(patBuf, pattern, actualPatLen);
146    patBuf[actualPatLen] = 0;
147
148    UText patText = UTEXT_INITIALIZER;
149    utext_openUChars(&patText, patBuf, patternLength, status);
150
151    //
152    // Compile the pattern
153    //
154    if (pe != NULL) {
155        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
156    } else {
157        re->fPat = RegexPattern::compile(&patText, flags, *status);
158    }
159    utext_close(&patText);
160
161    if (U_FAILURE(*status)) {
162        goto ErrorExit;
163    }
164
165    //
166    // Create the matcher object
167    //
168    re->fMatcher = re->fPat->matcher(*status);
169    if (U_SUCCESS(*status)) {
170        return (URegularExpression*)re;
171    }
172
173ErrorExit:
174    delete re;
175    return NULL;
176
177}
178
179//----------------------------------------------------------------------------------------
180//
181//    uregex_openUText
182//
183//----------------------------------------------------------------------------------------
184U_CAPI URegularExpression *  U_EXPORT2
185uregex_openUText(UText          *pattern,
186                 uint32_t        flags,
187                 UParseError    *pe,
188                 UErrorCode     *status) {
189
190    if (U_FAILURE(*status)) {
191        return NULL;
192    }
193    if (pattern == NULL) {
194        *status = U_ILLEGAL_ARGUMENT_ERROR;
195        return NULL;
196    }
197
198    int64_t patternNativeLength = utext_nativeLength(pattern);
199
200    if (patternNativeLength == 0) {
201        *status = U_ILLEGAL_ARGUMENT_ERROR;
202        return NULL;
203    }
204
205    RegularExpression *re     = new RegularExpression;
206
207    UErrorCode lengthStatus = U_ZERO_ERROR;
208    int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
209
210    u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
211    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
212    if (re == NULL || refC == NULL || patBuf == NULL) {
213        *status = U_MEMORY_ALLOCATION_ERROR;
214        delete re;
215        uprv_free((void *)refC);
216        uprv_free(patBuf);
217        return NULL;
218    }
219    re->fPatRefCount = refC;
220    *re->fPatRefCount = 1;
221
222    //
223    // Make a copy of the pattern string, so we can return it later if asked.
224    //    For compiling the pattern, we will use a read-only UText wrapper
225    //    around this local copy, to avoid making even more copies.
226    //
227    re->fPatString    = patBuf;
228    re->fPatStringLen = pattern16Length;
229    utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
230
231    UText patText = UTEXT_INITIALIZER;
232    utext_openUChars(&patText, patBuf, pattern16Length, status);
233
234    //
235    // Compile the pattern
236    //
237    if (pe != NULL) {
238        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
239    } else {
240        re->fPat = RegexPattern::compile(&patText, flags, *status);
241    }
242    utext_close(&patText);
243
244    if (U_FAILURE(*status)) {
245        goto ErrorExit;
246    }
247
248    //
249    // Create the matcher object
250    //
251    re->fMatcher = re->fPat->matcher(*status);
252    if (U_SUCCESS(*status)) {
253        return (URegularExpression*)re;
254    }
255
256ErrorExit:
257    delete re;
258    return NULL;
259
260}
261
262//----------------------------------------------------------------------------------------
263//
264//    uregex_close
265//
266//----------------------------------------------------------------------------------------
267U_CAPI void  U_EXPORT2
268uregex_close(URegularExpression  *re2) {
269    RegularExpression *re = (RegularExpression*)re2;
270    UErrorCode  status = U_ZERO_ERROR;
271    if (validateRE(re, FALSE, &status) == FALSE) {
272        return;
273    }
274    delete re;
275}
276
277
278//----------------------------------------------------------------------------------------
279//
280//    uregex_clone
281//
282//----------------------------------------------------------------------------------------
283U_CAPI URegularExpression * U_EXPORT2
284uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
285    RegularExpression *source = (RegularExpression*)source2;
286    if (validateRE(source, FALSE, status) == FALSE) {
287        return NULL;
288    }
289
290    RegularExpression *clone = new RegularExpression;
291    if (clone == NULL) {
292        *status = U_MEMORY_ALLOCATION_ERROR;
293        return NULL;
294    }
295
296    clone->fMatcher = source->fPat->matcher(*status);
297    if (U_FAILURE(*status)) {
298        delete clone;
299        return NULL;
300    }
301
302    clone->fPat          = source->fPat;
303    clone->fPatRefCount  = source->fPatRefCount;
304    clone->fPatString    = source->fPatString;
305    clone->fPatStringLen = source->fPatStringLen;
306    umtx_atomic_inc(source->fPatRefCount);
307    // Note:  fText is not cloned.
308
309    return (URegularExpression*)clone;
310}
311
312
313
314
315//------------------------------------------------------------------------------
316//
317//    uregex_pattern
318//
319//------------------------------------------------------------------------------
320U_CAPI const UChar * U_EXPORT2
321uregex_pattern(const  URegularExpression *regexp2,
322                      int32_t            *patLength,
323                      UErrorCode         *status)  {
324    RegularExpression *regexp = (RegularExpression*)regexp2;
325
326    if (validateRE(regexp, FALSE, status) == FALSE) {
327        return NULL;
328    }
329    if (patLength != NULL) {
330        *patLength = regexp->fPatStringLen;
331    }
332    return regexp->fPatString;
333}
334
335
336//------------------------------------------------------------------------------
337//
338//    uregex_patternUText
339//
340//------------------------------------------------------------------------------
341U_CAPI UText * U_EXPORT2
342uregex_patternUText(const URegularExpression *regexp2,
343                          UErrorCode         *status)  {
344    RegularExpression *regexp = (RegularExpression*)regexp2;
345    return regexp->fPat->patternText(*status);
346}
347
348
349//------------------------------------------------------------------------------
350//
351//    uregex_flags
352//
353//------------------------------------------------------------------------------
354U_CAPI int32_t U_EXPORT2
355uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
356    RegularExpression *regexp = (RegularExpression*)regexp2;
357    if (validateRE(regexp, FALSE, status) == FALSE) {
358        return 0;
359    }
360    int32_t flags = regexp->fPat->flags();
361    return flags;
362}
363
364
365//------------------------------------------------------------------------------
366//
367//    uregex_setText
368//
369//------------------------------------------------------------------------------
370U_CAPI void U_EXPORT2
371uregex_setText(URegularExpression *regexp2,
372               const UChar        *text,
373               int32_t             textLength,
374               UErrorCode         *status)  {
375    RegularExpression *regexp = (RegularExpression*)regexp2;
376    if (validateRE(regexp, FALSE, status) == FALSE) {
377        return;
378    }
379    if (text == NULL || textLength < -1) {
380        *status = U_ILLEGAL_ARGUMENT_ERROR;
381        return;
382    }
383
384    if (regexp->fOwnsText && regexp->fText != NULL) {
385        uprv_free((void *)regexp->fText);
386    }
387
388    regexp->fText       = text;
389    regexp->fTextLength = textLength;
390    regexp->fOwnsText   = FALSE;
391
392    UText input = UTEXT_INITIALIZER;
393    utext_openUChars(&input, text, textLength, status);
394    regexp->fMatcher->reset(&input);
395    utext_close(&input); // reset() made a shallow clone, so we don't need this copy
396}
397
398
399//------------------------------------------------------------------------------
400//
401//    uregex_setUText
402//
403//------------------------------------------------------------------------------
404U_CAPI void U_EXPORT2
405uregex_setUText(URegularExpression *regexp2,
406                UText              *text,
407                UErrorCode         *status) {
408    RegularExpression *regexp = (RegularExpression*)regexp2;
409    if (validateRE(regexp, FALSE, status) == FALSE) {
410        return;
411    }
412    if (text == NULL) {
413        *status = U_ILLEGAL_ARGUMENT_ERROR;
414        return;
415    }
416
417    if (regexp->fOwnsText && regexp->fText != NULL) {
418        uprv_free((void *)regexp->fText);
419    }
420
421    regexp->fText       = NULL; // only fill it in on request
422    regexp->fTextLength = -1;
423    regexp->fOwnsText   = TRUE;
424    regexp->fMatcher->reset(text);
425}
426
427
428
429//------------------------------------------------------------------------------
430//
431//    uregex_getText
432//
433//------------------------------------------------------------------------------
434U_CAPI const UChar * U_EXPORT2
435uregex_getText(URegularExpression *regexp2,
436               int32_t            *textLength,
437               UErrorCode         *status)  {
438    RegularExpression *regexp = (RegularExpression*)regexp2;
439    if (validateRE(regexp, FALSE, status) == FALSE) {
440        return NULL;
441    }
442
443    if (regexp->fText == NULL) {
444        // need to fill in the text
445        UText *inputText = regexp->fMatcher->inputText();
446        int64_t inputNativeLength = utext_nativeLength(inputText);
447        if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
448            regexp->fText = inputText->chunkContents;
449            regexp->fTextLength = (int32_t)inputNativeLength;
450            regexp->fOwnsText = FALSE; // because the UText owns it
451        } else {
452            UErrorCode lengthStatus = U_ZERO_ERROR;
453            regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
454            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
455
456            utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
457            regexp->fText = inputChars;
458            regexp->fOwnsText = TRUE; // should already be set but just in case
459        }
460    }
461
462    if (textLength != NULL) {
463        *textLength = regexp->fTextLength;
464    }
465    return regexp->fText;
466}
467
468
469//------------------------------------------------------------------------------
470//
471//    uregex_getUText
472//
473//------------------------------------------------------------------------------
474U_CAPI UText * U_EXPORT2
475uregex_getUText(URegularExpression *regexp2,
476                UText              *dest,
477                UErrorCode         *status)  {
478    RegularExpression *regexp = (RegularExpression*)regexp2;
479    if (validateRE(regexp, FALSE, status) == FALSE) {
480        return dest;
481    }
482    return regexp->fMatcher->getInput(dest, *status);
483}
484
485
486//------------------------------------------------------------------------------
487//
488//    uregex_refreshUText
489//
490//------------------------------------------------------------------------------
491U_CAPI void U_EXPORT2
492uregex_refreshUText(URegularExpression *regexp2,
493                    UText              *text,
494                    UErrorCode         *status) {
495    RegularExpression *regexp = (RegularExpression*)regexp2;
496    if (validateRE(regexp, FALSE, status) == FALSE) {
497        return;
498    }
499    regexp->fMatcher->refreshInputText(text, *status);
500}
501
502
503//------------------------------------------------------------------------------
504//
505//    uregex_matches
506//
507//------------------------------------------------------------------------------
508U_CAPI UBool U_EXPORT2
509uregex_matches(URegularExpression *regexp2,
510               int32_t            startIndex,
511               UErrorCode        *status)  {
512    return uregex_matches64( regexp2, (int64_t)startIndex, status);
513}
514
515U_CAPI UBool U_EXPORT2
516uregex_matches64(URegularExpression *regexp2,
517                 int64_t            startIndex,
518                 UErrorCode        *status)  {
519    RegularExpression *regexp = (RegularExpression*)regexp2;
520    UBool result = FALSE;
521    if (validateRE(regexp, TRUE, status) == FALSE) {
522        return result;
523    }
524    if (startIndex == -1) {
525        result = regexp->fMatcher->matches(*status);
526    } else {
527        result = regexp->fMatcher->matches(startIndex, *status);
528    }
529    return result;
530}
531
532
533//------------------------------------------------------------------------------
534//
535//    uregex_lookingAt
536//
537//------------------------------------------------------------------------------
538U_CAPI UBool U_EXPORT2
539uregex_lookingAt(URegularExpression *regexp2,
540                 int32_t             startIndex,
541                 UErrorCode         *status)  {
542    return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
543}
544
545U_CAPI UBool U_EXPORT2
546uregex_lookingAt64(URegularExpression *regexp2,
547                   int64_t             startIndex,
548                   UErrorCode         *status)  {
549    RegularExpression *regexp = (RegularExpression*)regexp2;
550    UBool result = FALSE;
551    if (validateRE(regexp, TRUE, status) == FALSE) {
552        return result;
553    }
554    if (startIndex == -1) {
555        result = regexp->fMatcher->lookingAt(*status);
556    } else {
557        result = regexp->fMatcher->lookingAt(startIndex, *status);
558    }
559    return result;
560}
561
562
563
564//------------------------------------------------------------------------------
565//
566//    uregex_find
567//
568//------------------------------------------------------------------------------
569U_CAPI UBool U_EXPORT2
570uregex_find(URegularExpression *regexp2,
571            int32_t             startIndex,
572            UErrorCode         *status)  {
573    return uregex_find64( regexp2, (int64_t)startIndex, status);
574}
575
576U_CAPI UBool U_EXPORT2
577uregex_find64(URegularExpression *regexp2,
578              int64_t             startIndex,
579              UErrorCode         *status)  {
580    RegularExpression *regexp = (RegularExpression*)regexp2;
581    UBool result = FALSE;
582    if (validateRE(regexp, TRUE, status) == FALSE) {
583        return result;
584    }
585    if (startIndex == -1) {
586        regexp->fMatcher->resetPreserveRegion();
587        result = regexp->fMatcher->find();
588    } else {
589        result = regexp->fMatcher->find(startIndex, *status);
590    }
591    return result;
592}
593
594
595//------------------------------------------------------------------------------
596//
597//    uregex_findNext
598//
599//------------------------------------------------------------------------------
600U_CAPI UBool U_EXPORT2
601uregex_findNext(URegularExpression *regexp2,
602                UErrorCode         *status)  {
603    RegularExpression *regexp = (RegularExpression*)regexp2;
604    if (validateRE(regexp, TRUE, status) == FALSE) {
605        return FALSE;
606    }
607    UBool result = regexp->fMatcher->find();
608    return result;
609}
610
611//------------------------------------------------------------------------------
612//
613//    uregex_groupCount
614//
615//------------------------------------------------------------------------------
616U_CAPI int32_t U_EXPORT2
617uregex_groupCount(URegularExpression *regexp2,
618                  UErrorCode         *status)  {
619    RegularExpression *regexp = (RegularExpression*)regexp2;
620    if (validateRE(regexp, FALSE, status) == FALSE) {
621        return 0;
622    }
623    int32_t  result = regexp->fMatcher->groupCount();
624    return result;
625}
626
627
628//------------------------------------------------------------------------------
629//
630//    uregex_group
631//
632//------------------------------------------------------------------------------
633U_CAPI int32_t U_EXPORT2
634uregex_group(URegularExpression *regexp2,
635             int32_t             groupNum,
636             UChar              *dest,
637             int32_t             destCapacity,
638             UErrorCode          *status)  {
639    RegularExpression *regexp = (RegularExpression*)regexp2;
640    if (validateRE(regexp, TRUE, status) == FALSE) {
641        return 0;
642    }
643    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
644        *status = U_ILLEGAL_ARGUMENT_ERROR;
645        return 0;
646    }
647
648    if (destCapacity == 0 || regexp->fText != NULL) {
649        // If preflighting or if we already have the text as UChars,
650        // this is a little cheaper than going through uregex_groupUTextDeep()
651
652        //
653        // Pick up the range of characters from the matcher
654        //
655        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
656        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
657        if (U_FAILURE(*status)) {
658            return 0;
659        }
660
661        //
662        // Trim length based on buffer capacity
663        //
664        int32_t fullLength = endIx - startIx;
665        int32_t copyLength = fullLength;
666        if (copyLength < destCapacity) {
667            dest[copyLength] = 0;
668        } else if (copyLength == destCapacity) {
669            *status = U_STRING_NOT_TERMINATED_WARNING;
670        } else {
671            copyLength = destCapacity;
672            *status = U_BUFFER_OVERFLOW_ERROR;
673        }
674
675        //
676        // Copy capture group to user's buffer
677        //
678        if (copyLength > 0) {
679            u_memcpy(dest, &regexp->fText[startIx], copyLength);
680        }
681        return fullLength;
682    } else {
683        int32_t result = 0;
684        UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
685        if (U_SUCCESS(*status)) {
686            result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
687        }
688        utext_close(groupText);
689        return result;
690    }
691}
692
693
694//------------------------------------------------------------------------------
695//
696//    uregex_groupUText
697//
698//------------------------------------------------------------------------------
699U_CAPI UText * U_EXPORT2
700uregex_groupUText(URegularExpression *regexp2,
701                  int32_t             groupNum,
702                  UText              *dest,
703                  int64_t            *groupLength,
704                  UErrorCode         *status)  {
705    RegularExpression *regexp = (RegularExpression*)regexp2;
706    if (validateRE(regexp, TRUE, status) == FALSE) {
707        UErrorCode emptyTextStatus = U_ZERO_ERROR;
708        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
709    }
710
711    return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
712}
713
714//------------------------------------------------------------------------------
715//
716//    uregex_groupUTextDeep
717//
718//------------------------------------------------------------------------------
719U_CAPI UText * U_EXPORT2
720uregex_groupUTextDeep(URegularExpression *regexp2,
721                  int32_t             groupNum,
722                  UText              *dest,
723                  UErrorCode         *status)  {
724    RegularExpression *regexp = (RegularExpression*)regexp2;
725    if (validateRE(regexp, TRUE, status) == FALSE) {
726        UErrorCode emptyTextStatus = U_ZERO_ERROR;
727        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
728    }
729
730    if (regexp->fText != NULL) {
731        //
732        // Pick up the range of characters from the matcher
733        // and use our already-extracted characters
734        //
735        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
736        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
737        if (U_FAILURE(*status)) {
738            UErrorCode emptyTextStatus = U_ZERO_ERROR;
739            return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
740        }
741
742        if (dest) {
743            utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
744        } else {
745            UText groupText = UTEXT_INITIALIZER;
746            utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
747            dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
748            utext_close(&groupText);
749        }
750
751        return dest;
752    } else {
753        return regexp->fMatcher->group(groupNum, dest, *status);
754    }
755}
756
757//------------------------------------------------------------------------------
758//
759//    uregex_start
760//
761//------------------------------------------------------------------------------
762U_CAPI int32_t U_EXPORT2
763uregex_start(URegularExpression *regexp2,
764             int32_t             groupNum,
765             UErrorCode          *status)  {
766    return (int32_t)uregex_start64( regexp2, groupNum, status);
767}
768
769U_CAPI int64_t U_EXPORT2
770uregex_start64(URegularExpression *regexp2,
771               int32_t             groupNum,
772               UErrorCode          *status)  {
773    RegularExpression *regexp = (RegularExpression*)regexp2;
774    if (validateRE(regexp, TRUE, status) == FALSE) {
775        return 0;
776    }
777    int32_t result = regexp->fMatcher->start(groupNum, *status);
778    return result;
779}
780
781//------------------------------------------------------------------------------
782//
783//    uregex_end
784//
785//------------------------------------------------------------------------------
786U_CAPI int32_t U_EXPORT2
787uregex_end(URegularExpression   *regexp2,
788           int32_t               groupNum,
789           UErrorCode           *status)  {
790    return (int32_t)uregex_end64( regexp2, groupNum, status);
791}
792
793U_CAPI int64_t U_EXPORT2
794uregex_end64(URegularExpression   *regexp2,
795             int32_t               groupNum,
796             UErrorCode           *status)  {
797    RegularExpression *regexp = (RegularExpression*)regexp2;
798    if (validateRE(regexp, TRUE, status) == FALSE) {
799        return 0;
800    }
801    int32_t result = regexp->fMatcher->end(groupNum, *status);
802    return result;
803}
804
805//------------------------------------------------------------------------------
806//
807//    uregex_reset
808//
809//------------------------------------------------------------------------------
810U_CAPI void U_EXPORT2
811uregex_reset(URegularExpression    *regexp2,
812             int32_t               index,
813             UErrorCode            *status)  {
814    uregex_reset64( regexp2, (int64_t)index, status);
815}
816
817U_CAPI void U_EXPORT2
818uregex_reset64(URegularExpression    *regexp2,
819               int64_t               index,
820               UErrorCode            *status)  {
821    RegularExpression *regexp = (RegularExpression*)regexp2;
822    if (validateRE(regexp, TRUE, status) == FALSE) {
823        return;
824    }
825    regexp->fMatcher->reset(index, *status);
826}
827
828
829//------------------------------------------------------------------------------
830//
831//    uregex_setRegion
832//
833//------------------------------------------------------------------------------
834U_CAPI void U_EXPORT2
835uregex_setRegion(URegularExpression   *regexp2,
836                 int32_t               regionStart,
837                 int32_t               regionLimit,
838                 UErrorCode           *status)  {
839    uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
840}
841
842U_CAPI void U_EXPORT2
843uregex_setRegion64(URegularExpression   *regexp2,
844                   int64_t               regionStart,
845                   int64_t               regionLimit,
846                   UErrorCode           *status)  {
847    RegularExpression *regexp = (RegularExpression*)regexp2;
848    if (validateRE(regexp, TRUE, status) == FALSE) {
849        return;
850    }
851    regexp->fMatcher->region(regionStart, regionLimit, *status);
852}
853
854
855//------------------------------------------------------------------------------
856//
857//    uregex_setRegionAndStart
858//
859//------------------------------------------------------------------------------
860U_CAPI void U_EXPORT2
861uregex_setRegionAndStart(URegularExpression   *regexp2,
862                 int64_t               regionStart,
863                 int64_t               regionLimit,
864                 int64_t               startIndex,
865                 UErrorCode           *status)  {
866    RegularExpression *regexp = (RegularExpression*)regexp2;
867    if (validateRE(regexp, TRUE, status) == FALSE) {
868        return;
869    }
870    regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
871}
872
873//------------------------------------------------------------------------------
874//
875//    uregex_regionStart
876//
877//------------------------------------------------------------------------------
878U_CAPI int32_t U_EXPORT2
879uregex_regionStart(const  URegularExpression   *regexp2,
880                          UErrorCode           *status)  {
881    return (int32_t)uregex_regionStart64(regexp2, status);
882}
883
884U_CAPI int64_t U_EXPORT2
885uregex_regionStart64(const  URegularExpression   *regexp2,
886                            UErrorCode           *status)  {
887    RegularExpression *regexp = (RegularExpression*)regexp2;
888    if (validateRE(regexp, TRUE, status) == FALSE) {
889        return 0;
890    }
891    return regexp->fMatcher->regionStart();
892}
893
894
895//------------------------------------------------------------------------------
896//
897//    uregex_regionEnd
898//
899//------------------------------------------------------------------------------
900U_CAPI int32_t U_EXPORT2
901uregex_regionEnd(const  URegularExpression   *regexp2,
902                        UErrorCode           *status)  {
903    return (int32_t)uregex_regionEnd64(regexp2, status);
904}
905
906U_CAPI int64_t U_EXPORT2
907uregex_regionEnd64(const  URegularExpression   *regexp2,
908                          UErrorCode           *status)  {
909    RegularExpression *regexp = (RegularExpression*)regexp2;
910    if (validateRE(regexp, TRUE, status) == FALSE) {
911        return 0;
912    }
913    return regexp->fMatcher->regionEnd();
914}
915
916
917//------------------------------------------------------------------------------
918//
919//    uregex_hasTransparentBounds
920//
921//------------------------------------------------------------------------------
922U_CAPI UBool U_EXPORT2
923uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
924                                   UErrorCode           *status)  {
925    RegularExpression *regexp = (RegularExpression*)regexp2;
926    if (validateRE(regexp, FALSE, status) == FALSE) {
927        return FALSE;
928    }
929    return regexp->fMatcher->hasTransparentBounds();
930}
931
932
933//------------------------------------------------------------------------------
934//
935//    uregex_useTransparentBounds
936//
937//------------------------------------------------------------------------------
938U_CAPI void U_EXPORT2
939uregex_useTransparentBounds(URegularExpression    *regexp2,
940                            UBool                  b,
941                            UErrorCode            *status)  {
942    RegularExpression *regexp = (RegularExpression*)regexp2;
943    if (validateRE(regexp, FALSE, status) == FALSE) {
944        return;
945    }
946    regexp->fMatcher->useTransparentBounds(b);
947}
948
949
950//------------------------------------------------------------------------------
951//
952//    uregex_hasAnchoringBounds
953//
954//------------------------------------------------------------------------------
955U_CAPI UBool U_EXPORT2
956uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
957                                 UErrorCode           *status)  {
958    RegularExpression *regexp = (RegularExpression*)regexp2;
959    if (validateRE(regexp, FALSE, status) == FALSE) {
960        return FALSE;
961    }
962    return regexp->fMatcher->hasAnchoringBounds();
963}
964
965
966//------------------------------------------------------------------------------
967//
968//    uregex_useAnchoringBounds
969//
970//------------------------------------------------------------------------------
971U_CAPI void U_EXPORT2
972uregex_useAnchoringBounds(URegularExpression    *regexp2,
973                          UBool                  b,
974                          UErrorCode            *status)  {
975    RegularExpression *regexp = (RegularExpression*)regexp2;
976    if (validateRE(regexp, FALSE, status) == FALSE) {
977        return;
978    }
979    regexp->fMatcher->useAnchoringBounds(b);
980}
981
982
983//------------------------------------------------------------------------------
984//
985//    uregex_hitEnd
986//
987//------------------------------------------------------------------------------
988U_CAPI UBool U_EXPORT2
989uregex_hitEnd(const  URegularExpression   *regexp2,
990                     UErrorCode           *status)  {
991    RegularExpression *regexp = (RegularExpression*)regexp2;
992    if (validateRE(regexp, TRUE, status) == FALSE) {
993        return FALSE;
994    }
995    return regexp->fMatcher->hitEnd();
996}
997
998
999//------------------------------------------------------------------------------
1000//
1001//    uregex_requireEnd
1002//
1003//------------------------------------------------------------------------------
1004U_CAPI UBool U_EXPORT2
1005uregex_requireEnd(const  URegularExpression   *regexp2,
1006                         UErrorCode           *status)  {
1007    RegularExpression *regexp = (RegularExpression*)regexp2;
1008    if (validateRE(regexp, TRUE, status) == FALSE) {
1009        return FALSE;
1010    }
1011    return regexp->fMatcher->requireEnd();
1012}
1013
1014
1015//------------------------------------------------------------------------------
1016//
1017//    uregex_setTimeLimit
1018//
1019//------------------------------------------------------------------------------
1020U_CAPI void U_EXPORT2
1021uregex_setTimeLimit(URegularExpression   *regexp2,
1022                    int32_t               limit,
1023                    UErrorCode           *status) {
1024    RegularExpression *regexp = (RegularExpression*)regexp2;
1025    if (validateRE(regexp, FALSE, status)) {
1026        regexp->fMatcher->setTimeLimit(limit, *status);
1027    }
1028}
1029
1030
1031
1032//------------------------------------------------------------------------------
1033//
1034//    uregex_getTimeLimit
1035//
1036//------------------------------------------------------------------------------
1037U_CAPI int32_t U_EXPORT2
1038uregex_getTimeLimit(const  URegularExpression   *regexp2,
1039                           UErrorCode           *status) {
1040    int32_t retVal = 0;
1041    RegularExpression *regexp = (RegularExpression*)regexp2;
1042    if (validateRE(regexp, FALSE, status)) {
1043        retVal = regexp->fMatcher->getTimeLimit();
1044    }
1045    return retVal;
1046}
1047
1048
1049
1050//------------------------------------------------------------------------------
1051//
1052//    uregex_setStackLimit
1053//
1054//------------------------------------------------------------------------------
1055U_CAPI void U_EXPORT2
1056uregex_setStackLimit(URegularExpression   *regexp2,
1057                     int32_t               limit,
1058                     UErrorCode           *status) {
1059    RegularExpression *regexp = (RegularExpression*)regexp2;
1060    if (validateRE(regexp, FALSE, status)) {
1061        regexp->fMatcher->setStackLimit(limit, *status);
1062    }
1063}
1064
1065
1066
1067//------------------------------------------------------------------------------
1068//
1069//    uregex_getStackLimit
1070//
1071//------------------------------------------------------------------------------
1072U_CAPI int32_t U_EXPORT2
1073uregex_getStackLimit(const  URegularExpression   *regexp2,
1074                            UErrorCode           *status) {
1075    int32_t retVal = 0;
1076    RegularExpression *regexp = (RegularExpression*)regexp2;
1077    if (validateRE(regexp, FALSE, status)) {
1078        retVal = regexp->fMatcher->getStackLimit();
1079    }
1080    return retVal;
1081}
1082
1083
1084//------------------------------------------------------------------------------
1085//
1086//    uregex_setMatchCallback
1087//
1088//------------------------------------------------------------------------------
1089U_CAPI void U_EXPORT2
1090uregex_setMatchCallback(URegularExpression      *regexp2,
1091                        URegexMatchCallback     *callback,
1092                        const void              *context,
1093                        UErrorCode              *status) {
1094    RegularExpression *regexp = (RegularExpression*)regexp2;
1095    if (validateRE(regexp, FALSE, status)) {
1096        regexp->fMatcher->setMatchCallback(callback, context, *status);
1097    }
1098}
1099
1100
1101//------------------------------------------------------------------------------
1102//
1103//    uregex_getMatchCallback
1104//
1105//------------------------------------------------------------------------------
1106U_CAPI void U_EXPORT2
1107uregex_getMatchCallback(const URegularExpression    *regexp2,
1108                        URegexMatchCallback        **callback,
1109                        const void                 **context,
1110                        UErrorCode                  *status) {
1111    RegularExpression *regexp = (RegularExpression*)regexp2;
1112     if (validateRE(regexp, FALSE, status)) {
1113         regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1114     }
1115}
1116
1117
1118//------------------------------------------------------------------------------
1119//
//    uregex_setFindProgressCallback
1121//
1122//------------------------------------------------------------------------------
1123U_CAPI void U_EXPORT2
1124uregex_setFindProgressCallback(URegularExpression              *regexp2,
1125                                URegexFindProgressCallback      *callback,
1126                                const void                      *context,
1127                                UErrorCode                      *status) {
1128    RegularExpression *regexp = (RegularExpression*)regexp2;
1129    if (validateRE(regexp, FALSE, status)) {
1130        regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1131    }
1132}
1133
1134
1135//------------------------------------------------------------------------------
1136//
//    uregex_getFindProgressCallback
1138//
1139//------------------------------------------------------------------------------
1140U_CAPI void U_EXPORT2
1141uregex_getFindProgressCallback(const URegularExpression          *regexp2,
1142                                URegexFindProgressCallback        **callback,
1143                                const void                        **context,
1144                                UErrorCode                        *status) {
1145    RegularExpression *regexp = (RegularExpression*)regexp2;
1146     if (validateRE(regexp, FALSE, status)) {
1147         regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1148     }
1149}
1150
1151
1152//------------------------------------------------------------------------------
1153//
1154//    uregex_replaceAll
1155//
1156//------------------------------------------------------------------------------
1157U_CAPI int32_t U_EXPORT2
1158uregex_replaceAll(URegularExpression    *regexp2,
1159                  const UChar           *replacementText,
1160                  int32_t                replacementLength,
1161                  UChar                 *destBuf,
1162                  int32_t                destCapacity,
1163                  UErrorCode            *status)  {
1164    RegularExpression *regexp = (RegularExpression*)regexp2;
1165    if (validateRE(regexp, TRUE, status) == FALSE) {
1166        return 0;
1167    }
1168    if (replacementText == NULL || replacementLength < -1 ||
1169        (destBuf == NULL && destCapacity > 0) ||
1170        destCapacity < 0) {
1171        *status = U_ILLEGAL_ARGUMENT_ERROR;
1172        return 0;
1173    }
1174
1175    int32_t   len = 0;
1176
1177    uregex_reset(regexp2, 0, status);
1178
1179    // Note: Seperate error code variables for findNext() and appendReplacement()
1180    //       are used so that destination buffer overflow errors
1181    //       in appendReplacement won't stop findNext() from working.
1182    //       appendReplacement() and appendTail() special case incoming buffer
1183    //       overflow errors, continuing to return the correct length.
1184    UErrorCode  findStatus = *status;
1185    while (uregex_findNext(regexp2, &findStatus)) {
1186        len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1187                                        &destBuf, &destCapacity, status);
1188    }
1189    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1190
1191    if (U_FAILURE(findStatus)) {
1192        // If anything went wrong with the findNext(), make that error trump
1193        //   whatever may have happened with the append() operations.
1194        //   Errors in findNext() are not expected.
1195        *status = findStatus;
1196    }
1197
1198    return len;
1199}
1200
1201
1202//------------------------------------------------------------------------------
1203//
1204//    uregex_replaceAllUText
1205//
1206//------------------------------------------------------------------------------
1207U_CAPI UText * U_EXPORT2
1208uregex_replaceAllUText(URegularExpression    *regexp2,
1209                       UText                 *replacementText,
1210                       UText                 *dest,
1211                       UErrorCode            *status)  {
1212    RegularExpression *regexp = (RegularExpression*)regexp2;
1213    if (validateRE(regexp, TRUE, status) == FALSE) {
1214        return 0;
1215    }
1216    if (replacementText == NULL) {
1217        *status = U_ILLEGAL_ARGUMENT_ERROR;
1218        return 0;
1219    }
1220
1221    dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1222    return dest;
1223}
1224
1225
1226//------------------------------------------------------------------------------
1227//
1228//    uregex_replaceFirst
1229//
1230//------------------------------------------------------------------------------
1231U_CAPI int32_t U_EXPORT2
1232uregex_replaceFirst(URegularExpression  *regexp2,
1233                    const UChar         *replacementText,
1234                    int32_t              replacementLength,
1235                    UChar               *destBuf,
1236                    int32_t              destCapacity,
1237                    UErrorCode          *status)  {
1238    RegularExpression *regexp = (RegularExpression*)regexp2;
1239    if (validateRE(regexp, TRUE, status) == FALSE) {
1240        return 0;
1241    }
1242    if (replacementText == NULL || replacementLength < -1 ||
1243        (destBuf == NULL && destCapacity > 0) ||
1244        destCapacity < 0) {
1245        *status = U_ILLEGAL_ARGUMENT_ERROR;
1246        return 0;
1247    }
1248
1249    int32_t   len = 0;
1250    UBool     findSucceeded;
1251    uregex_reset(regexp2, 0, status);
1252    findSucceeded = uregex_find(regexp2, 0, status);
1253    if (findSucceeded) {
1254        len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
1255                                       &destBuf, &destCapacity, status);
1256    }
1257    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1258
1259    return len;
1260}
1261
1262
1263//------------------------------------------------------------------------------
1264//
1265//    uregex_replaceFirstUText
1266//
1267//------------------------------------------------------------------------------
1268U_CAPI UText * U_EXPORT2
1269uregex_replaceFirstUText(URegularExpression  *regexp2,
1270                         UText                 *replacementText,
1271                         UText                 *dest,
1272                         UErrorCode            *status)  {
1273    RegularExpression *regexp = (RegularExpression*)regexp2;
1274    if (validateRE(regexp, TRUE, status) == FALSE) {
1275        return 0;
1276    }
1277    if (replacementText == NULL) {
1278        *status = U_ILLEGAL_ARGUMENT_ERROR;
1279        return 0;
1280    }
1281
1282    dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1283    return dest;
1284}
1285
1286
1287//------------------------------------------------------------------------------
1288//
1289//    uregex_appendReplacement
1290//
1291//------------------------------------------------------------------------------
1292
1293U_NAMESPACE_BEGIN
1294//
1295//  Dummy class, because these functions need to be friends of class RegexMatcher,
1296//               and stand-alone C functions don't work as friends
1297//
1298class RegexCImpl {
1299 public:
1300   inline static  int32_t appendReplacement(RegularExpression    *regexp,
1301                      const UChar           *replacementText,
1302                      int32_t                replacementLength,
1303                      UChar                **destBuf,
1304                      int32_t               *destCapacity,
1305                      UErrorCode            *status);
1306
1307   inline static int32_t appendTail(RegularExpression    *regexp,
1308        UChar                **destBuf,
1309        int32_t               *destCapacity,
1310        UErrorCode            *status);
1311
1312    inline static int32_t split(RegularExpression    *regexp,
1313        UChar                 *destBuf,
1314        int32_t                destCapacity,
1315        int32_t               *requiredCapacity,
1316        UChar                 *destFields[],
1317        int32_t                destFieldsCapacity,
1318        UErrorCode            *status);
1319};
1320
1321U_NAMESPACE_END
1322
1323
1324
1325static const UChar BACKSLASH  = 0x5c;
1326static const UChar DOLLARSIGN = 0x24;
1327
1328//
1329//  Move a character to an output buffer, with bounds checking on the index.
1330//      Index advances even if capacity is exceeded, for preflight size computations.
1331//      This little sequence is used a LOT.
1332//
1333static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
1334    if (*idx < bufCapacity) {
1335        buf[*idx] = c;
1336    }
1337    (*idx)++;
1338}
1339
1340
1341//
1342//  appendReplacement, the actual implementation.
1343//
1344int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
1345                                      const UChar           *replacementText,
1346                                      int32_t                replacementLength,
1347                                      UChar                **destBuf,
1348                                      int32_t               *destCapacity,
1349                                      UErrorCode            *status)  {
1350
1351    // If we come in with a buffer overflow error, don't suppress the operation.
1352    //  A series of appendReplacements, appendTail need to correctly preflight
1353    //  the buffer size when an overflow happens somewhere in the middle.
1354    UBool pendingBufferOverflow = FALSE;
1355    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1356        pendingBufferOverflow = TRUE;
1357        *status = U_ZERO_ERROR;
1358    }
1359
1360    //
1361    // Validate all paramters
1362    //
1363    if (validateRE(regexp, TRUE, status) == FALSE) {
1364        return 0;
1365    }
1366    if (replacementText == NULL || replacementLength < -1 ||
1367        destCapacity == NULL || destBuf == NULL ||
1368        (*destBuf == NULL && *destCapacity > 0) ||
1369        *destCapacity < 0) {
1370        *status = U_ILLEGAL_ARGUMENT_ERROR;
1371        return 0;
1372    }
1373
1374    RegexMatcher *m = regexp->fMatcher;
1375    if (m->fMatch == FALSE) {
1376        *status = U_REGEX_INVALID_STATE;
1377        return 0;
1378    }
1379
1380    UChar    *dest             = *destBuf;
1381    int32_t   capacity         = *destCapacity;
1382    int32_t   destIdx          =  0;
1383    int32_t   i;
1384
1385    // If it wasn't supplied by the caller,  get the length of the replacement text.
1386    //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
1387    //          the fly and avoid this step.
1388    if (replacementLength == -1) {
1389        replacementLength = u_strlen(replacementText);
1390    }
1391
1392    // Copy input string from the end of previous match to start of current match
1393    if (regexp->fText != NULL) {
1394        int32_t matchStart;
1395        int32_t lastMatchEnd;
1396        if (UTEXT_USES_U16(m->fInputText)) {
1397            lastMatchEnd = (int32_t)m->fLastMatchEnd;
1398            matchStart = (int32_t)m->fMatchStart;
1399        } else {
1400            // !!!: Would like a better way to do this!
1401            UErrorCode status = U_ZERO_ERROR;
1402            lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
1403            status = U_ZERO_ERROR;
1404            matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
1405        }
1406        for (i=lastMatchEnd; i<matchStart; i++) {
1407            appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
1408        }
1409    } else {
1410        UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1411        destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1412                                 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
1413                                 &possibleOverflowError);
1414    }
1415    U_ASSERT(destIdx >= 0);
1416
1417    // scan the replacement text, looking for substitutions ($n) and \escapes.
1418    int32_t  replIdx = 0;
1419    while (replIdx < replacementLength) {
1420        UChar  c = replacementText[replIdx];
1421        replIdx++;
1422        if (c != DOLLARSIGN && c != BACKSLASH) {
1423            // Common case, no substitution, no escaping,
1424            //  just copy the char to the dest buf.
1425            appendToBuf(c, &destIdx, dest, capacity);
1426            continue;
1427        }
1428
1429        if (c == BACKSLASH) {
1430            // Backslash Escape.  Copy the following char out without further checks.
1431            //                    Note:  Surrogate pairs don't need any special handling
1432            //                           The second half wont be a '$' or a '\', and
1433            //                           will move to the dest normally on the next
1434            //                           loop iteration.
1435            if (replIdx >= replacementLength) {
1436                break;
1437            }
1438            c = replacementText[replIdx];
1439
1440            if (c==0x55/*U*/ || c==0x75/*u*/) {
1441                // We have a \udddd or \Udddddddd escape sequence.
1442                UChar32 escapedChar =
1443                    u_unescapeAt(uregex_ucstr_unescape_charAt,
1444                       &replIdx,                   // Index is updated by unescapeAt
1445                       replacementLength,          // Length of replacement text
1446                       (void *)replacementText);
1447
1448                if (escapedChar != (UChar32)0xFFFFFFFF) {
1449                    if (escapedChar <= 0xffff) {
1450                        appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1451                    } else {
1452                        appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1453                        appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1454                    }
1455                    continue;
1456                }
1457                // Note:  if the \u escape was invalid, just fall through and
1458                //        treat it as a plain \<anything> escape.
1459            }
1460
1461            // Plain backslash escape.  Just put out the escaped character.
1462            appendToBuf(c, &destIdx, dest, capacity);
1463
1464            replIdx++;
1465            continue;
1466        }
1467
1468
1469
1470        // We've got a $.  Pick up a capture group number if one follows.
1471        // Consume at most the number of digits necessary for the largest capture
1472        // number that is valid for this pattern.
1473
1474        int32_t numDigits = 0;
1475        int32_t groupNum  = 0;
1476        UChar32 digitC;
1477        for (;;) {
1478            if (replIdx >= replacementLength) {
1479                break;
1480            }
1481            U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
1482            if (u_isdigit(digitC) == FALSE) {
1483                break;
1484            }
1485
1486            U16_FWD_1(replacementText, replIdx, replacementLength);
1487            groupNum=groupNum*10 + u_charDigitValue(digitC);
1488            numDigits++;
1489            if (numDigits >= m->fPattern->fMaxCaptureDigits) {
1490                break;
1491            }
1492        }
1493
1494
1495        if (numDigits == 0) {
1496            // The $ didn't introduce a group number at all.
1497            // Treat it as just part of the substitution text.
1498            appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
1499            continue;
1500        }
1501
1502        // Finally, append the capture group data to the destination.
1503        destIdx += uregex_group((URegularExpression*)regexp, groupNum,
1504                                dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1505        if (*status == U_BUFFER_OVERFLOW_ERROR) {
1506            // Ignore buffer overflow when extracting the group.  We need to
1507            //   continue on to get full size of the untruncated result.  We will
1508            //   raise our own buffer overflow error at the end.
1509            *status = U_ZERO_ERROR;
1510        }
1511
1512        if (U_FAILURE(*status)) {
1513            // Can fail if group number is out of range.
1514            break;
1515        }
1516
1517    }
1518
1519    //
1520    //  Nul Terminate the dest buffer if possible.
1521    //  Set the appropriate buffer overflow or not terminated error, if needed.
1522    //
1523    if (destIdx < capacity) {
1524        dest[destIdx] = 0;
1525    } else if (destIdx == *destCapacity) {
1526        *status = U_STRING_NOT_TERMINATED_WARNING;
1527    } else {
1528        *status = U_BUFFER_OVERFLOW_ERROR;
1529    }
1530
1531    //
1532    // Return an updated dest buffer and capacity to the caller.
1533    //
1534    if (destIdx > 0 &&  *destCapacity > 0) {
1535        if (destIdx < capacity) {
1536            *destBuf      += destIdx;
1537            *destCapacity -= destIdx;
1538        } else {
1539            *destBuf      += capacity;
1540            *destCapacity =  0;
1541        }
1542    }
1543
1544    // If we came in with a buffer overflow, make sure we go out with one also.
1545    //   (A zero length match right at the end of the previous match could
1546    //    make this function succeed even though a previous call had overflowed the buf)
1547    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1548        *status = U_BUFFER_OVERFLOW_ERROR;
1549    }
1550
1551    return destIdx;
1552}
1553
1554//
1555//   appendReplacement   the actual API function,
1556//
1557U_CAPI int32_t U_EXPORT2
1558uregex_appendReplacement(URegularExpression    *regexp2,
1559                         const UChar           *replacementText,
1560                         int32_t                replacementLength,
1561                         UChar                **destBuf,
1562                         int32_t               *destCapacity,
1563                         UErrorCode            *status) {
1564
1565    RegularExpression *regexp = (RegularExpression*)regexp2;
1566    return RegexCImpl::appendReplacement(
1567        regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1568}
1569
1570//
1571//   uregex_appendReplacementUText...can just use the normal C++ method
1572//
1573U_CAPI void U_EXPORT2
1574uregex_appendReplacementUText(URegularExpression    *regexp2,
1575                              UText                 *replText,
1576                              UText                 *dest,
1577                              UErrorCode            *status)  {
1578    RegularExpression *regexp = (RegularExpression*)regexp2;
1579    regexp->fMatcher->appendReplacement(dest, replText, *status);
1580}
1581
1582
1583//------------------------------------------------------------------------------
1584//
1585//    uregex_appendTail
1586//
1587//------------------------------------------------------------------------------
1588int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
1589                               UChar                **destBuf,
1590                               int32_t               *destCapacity,
1591                               UErrorCode            *status)
1592{
1593
1594    // If we come in with a buffer overflow error, don't suppress the operation.
1595    //  A series of appendReplacements, appendTail need to correctly preflight
1596    //  the buffer size when an overflow happens somewhere in the middle.
1597    UBool pendingBufferOverflow = FALSE;
1598    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1599        pendingBufferOverflow = TRUE;
1600        *status = U_ZERO_ERROR;
1601    }
1602
1603    if (validateRE(regexp, TRUE, status) == FALSE) {
1604        return 0;
1605    }
1606
1607    if (destCapacity == NULL || destBuf == NULL ||
1608        (*destBuf == NULL && *destCapacity > 0) ||
1609        *destCapacity < 0)
1610    {
1611        *status = U_ILLEGAL_ARGUMENT_ERROR;
1612        return 0;
1613    }
1614
1615    RegexMatcher *m = regexp->fMatcher;
1616
1617    int32_t  destIdx     = 0;
1618    int32_t  destCap     = *destCapacity;
1619    UChar    *dest       = *destBuf;
1620
1621    if (regexp->fText != NULL) {
1622        int32_t srcIdx;
1623        int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1624        if (nativeIdx == -1) {
1625            srcIdx = 0;
1626        } else if (UTEXT_USES_U16(m->fInputText)) {
1627            srcIdx = (int32_t)nativeIdx;
1628        } else {
1629            UErrorCode status = U_ZERO_ERROR;
1630            srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1631        }
1632
1633        for (;;) {
1634            U_ASSERT(destIdx >= 0);
1635
1636            if (srcIdx == regexp->fTextLength) {
1637                break;
1638            }
1639            UChar c = regexp->fText[srcIdx];
1640            if (c == 0 && regexp->fTextLength == -1) {
1641                regexp->fTextLength = srcIdx;
1642                break;
1643            }
1644
1645            if (destIdx < destCap) {
1646                dest[destIdx] = c;
1647            } else {
1648                // We've overflowed the dest buffer.
1649                //  If the total input string length is known, we can
1650                //    compute the total buffer size needed without scanning through the string.
1651                if (regexp->fTextLength > 0) {
1652                    destIdx += (regexp->fTextLength - srcIdx);
1653                    break;
1654                }
1655            }
1656            srcIdx++;
1657            destIdx++;
1658        }
1659    } else {
1660        int64_t  srcIdx;
1661        if (m->fMatch) {
1662            // The most recent call to find() succeeded.
1663            srcIdx = m->fMatchEnd;
1664        } else {
1665            // The last call to find() on this matcher failed().
1666            //   Look back to the end of the last find() that succeeded for src index.
1667            srcIdx = m->fLastMatchEnd;
1668            if (srcIdx == -1)  {
1669                // There has been no successful match with this matcher.
1670                //   We want to copy the whole string.
1671                srcIdx = 0;
1672            }
1673        }
1674
1675        destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1676    }
1677
1678    //
1679    //  NUL terminate the output string, if possible, otherwise issue the
1680    //   appropriate error or warning.
1681    //
1682    if (destIdx < destCap) {
1683        dest[destIdx] = 0;
1684    } else  if (destIdx == destCap) {
1685        *status = U_STRING_NOT_TERMINATED_WARNING;
1686    } else {
1687        *status = U_BUFFER_OVERFLOW_ERROR;
1688    }
1689
1690    //
1691    // Update the user's buffer ptr and capacity vars to reflect the
1692    //   amount used.
1693    //
1694    if (destIdx < destCap) {
1695        *destBuf      += destIdx;
1696        *destCapacity -= destIdx;
1697    } else if (*destBuf != NULL) {
1698        *destBuf      += destCap;
1699        *destCapacity  = 0;
1700    }
1701
1702    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1703        *status = U_BUFFER_OVERFLOW_ERROR;
1704    }
1705
1706    return destIdx;
1707}
1708
1709
1710//
1711//   appendTail   the actual API function
1712//
1713U_CAPI int32_t U_EXPORT2
1714uregex_appendTail(URegularExpression    *regexp2,
1715                  UChar                **destBuf,
1716                  int32_t               *destCapacity,
1717                  UErrorCode            *status)  {
1718    RegularExpression *regexp = (RegularExpression*)regexp2;
1719    return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1720}
1721
1722
1723//
1724//   uregex_appendTailUText...can just use the normal C++ method
1725//
1726U_CAPI UText * U_EXPORT2
1727uregex_appendTailUText(URegularExpression    *regexp2,
1728                       UText                 *dest,
1729                       UErrorCode            *status)  {
1730    RegularExpression *regexp = (RegularExpression*)regexp2;
1731    return regexp->fMatcher->appendTail(dest, *status);
1732}
1733
1734
//------------------------------------------------------------------------------
//
//    copyString     Internal utility that appends a source string to an output
//                   buffer at a caller-maintained index.
//
//                   Characters are stored only while they fit in the buffer,
//                   but the index always advances by the full source length
//                   plus one for the terminating NUL, so after an overflow it
//                   reports the size that would have been needed (preflight).
//                   The NUL itself is written only if there is room for it.
//
//                   Currently compiled out (#if 0).
//
//------------------------------------------------------------------------------
#if 0
static void copyString(UChar        *destBuffer,    //  Destination buffer.
                       int32_t       destCapacity,  //  Total capacity of dest buffer
                       int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
                                                    //    Update not clipped to destCapacity.
                       const UChar  *srcPtr,        //  Pointer to source string
                       int32_t       srcLen)        //  Source string len.
{
    int32_t  outPos = *destIndex;
    int32_t  srcPos = 0;

    // Copy for as long as there is both source left and room in the buffer.
    while (srcPos < srcLen && outPos < destCapacity) {
        destBuffer[outPos] = srcPtr[srcPos];
        ++outPos;
        ++srcPos;
    }

    // Anything that did not fit is counted, but not stored.
    if (srcPos < srcLen) {
        outPos += srcLen - srcPos;
    }

    // Terminate if possible; the NUL is counted whether or not it was stored.
    if (outPos < destCapacity) {
        destBuffer[outPos] = 0;
    }
    *destIndex = outPos + 1;
}
#endif
1772
1773//------------------------------------------------------------------------------
1774//
1775//    uregex_split
1776//
1777//------------------------------------------------------------------------------
1778int32_t RegexCImpl::split(RegularExpression     *regexp,
1779                          UChar                 *destBuf,
1780                          int32_t                destCapacity,
1781                          int32_t               *requiredCapacity,
1782                          UChar                 *destFields[],
1783                          int32_t                destFieldsCapacity,
1784                          UErrorCode            *status) {
1785    //
1786    // Reset for the input text
1787    //
1788    regexp->fMatcher->reset();
1789    UText *inputText = regexp->fMatcher->fInputText;
1790    int64_t   nextOutputStringStart = 0;
1791    int64_t   inputLen = regexp->fMatcher->fInputLength;
1792    if (inputLen == 0) {
1793        return 0;
1794    }
1795
1796    //
1797    // Loop through the input text, searching for the delimiter pattern
1798    //
1799    int32_t   i;             // Index of the field being processed.
1800    int32_t   destIdx = 0;   // Next available position in destBuf;
1801    int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
1802    UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
1803    for (i=0; ; i++) {
1804        if (i>=destFieldsCapacity-1) {
1805            // There are one or zero output strings left.
1806            // Fill the last output string with whatever is left from the input, then exit the loop.
1807            //  ( i will be == destFieldsCapacity if we filled the output array while processing
1808            //    capture groups of the delimiter expression, in which case we will discard the
1809            //    last capture group saved in favor of the unprocessed remainder of the
1810            //    input string.)
1811            if (inputLen > nextOutputStringStart) {
1812                if (i != destFieldsCapacity-1) {
1813                    // No fields are left.  Recycle the last one for holding the trailing part of
1814                    //   the input string.
1815                    i = destFieldsCapacity-1;
1816                    destIdx = (int32_t)(destFields[i] - destFields[0]);
1817                }
1818
1819                destFields[i] = &destBuf[destIdx];
1820                destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1821                                             &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1822            }
1823            break;
1824        }
1825
1826        if (regexp->fMatcher->find()) {
1827            // We found another delimiter.  Move everything from where we started looking
1828            //  up until the start of the delimiter into the next output string.
1829            destFields[i] = &destBuf[destIdx];
1830
1831            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1832                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1833            if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1834                tStatus = U_ZERO_ERROR;
1835            } else {
1836                *status = tStatus;
1837            }
1838            nextOutputStringStart = regexp->fMatcher->fMatchEnd;
1839
1840            // If the delimiter pattern has capturing parentheses, the captured
1841            //  text goes out into the next n destination strings.
1842            int32_t groupNum;
1843            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1844                // If we've run out of output string slots, bail out.
1845                if (i==destFieldsCapacity-1) {
1846                    break;
1847                }
1848                i++;
1849
1850                // Set up to extract the capture group contents into the dest buffer.
1851                destFields[i] = &destBuf[destIdx];
1852                tStatus = U_ZERO_ERROR;
1853                int32_t t = uregex_group((URegularExpression*)regexp,
1854                                         groupNum,
1855                                         destFields[i],
1856                                         REMAINING_CAPACITY(destIdx, destCapacity),
1857                                         &tStatus);
1858                destIdx += t + 1;    // Record the space used in the output string buffer.
1859                                     //  +1 for the NUL that terminates the string.
1860                if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1861                    tStatus = U_ZERO_ERROR;
1862                } else {
1863                    *status = tStatus;
1864                }
1865            }
1866
1867            if (nextOutputStringStart == inputLen) {
1868                // The delimiter was at the end of the string.
1869                // Output an empty string, and then we are done.
1870                if (destIdx < destCapacity) {
1871                    destBuf[destIdx] = 0;
1872                }
1873                if (i < destFieldsCapacity-1) {
1874                   ++i;
1875                }
1876                if (destIdx < destCapacity) {
1877                    destFields[i] = destBuf + destIdx;
1878                }
1879                ++destIdx;
1880                break;
1881            }
1882
1883        }
1884        else
1885        {
1886            // We ran off the end of the input while looking for the next delimiter.
1887            // All the remaining text goes into the current output string.
1888            destFields[i] = &destBuf[destIdx];
1889            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1890                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1891            break;
1892        }
1893    }
1894
1895    // Zero out any unused portion of the destFields array
1896    int j;
1897    for (j=i+1; j<destFieldsCapacity; j++) {
1898        destFields[j] = NULL;
1899    }
1900
1901    if (requiredCapacity != NULL) {
1902        *requiredCapacity = destIdx;
1903    }
1904    if (destIdx > destCapacity) {
1905        *status = U_BUFFER_OVERFLOW_ERROR;
1906    }
1907    return i+1;
1908}
1909
1910//
1911//   uregex_split   The actual API function
1912//
1913U_CAPI int32_t U_EXPORT2
1914uregex_split(URegularExpression      *regexp2,
1915             UChar                   *destBuf,
1916             int32_t                  destCapacity,
1917             int32_t                 *requiredCapacity,
1918             UChar                   *destFields[],
1919             int32_t                  destFieldsCapacity,
1920             UErrorCode              *status) {
1921    RegularExpression *regexp = (RegularExpression*)regexp2;
1922    if (validateRE(regexp, TRUE, status) == FALSE) {
1923        return 0;
1924    }
1925    if ((destBuf == NULL && destCapacity > 0) ||
1926        destCapacity < 0 ||
1927        destFields == NULL ||
1928        destFieldsCapacity < 1 ) {
1929        *status = U_ILLEGAL_ARGUMENT_ERROR;
1930        return 0;
1931    }
1932
1933    return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1934}
1935
1936
1937//
1938//   uregex_splitUText...can just use the normal C++ method
1939//
1940U_CAPI int32_t U_EXPORT2
1941uregex_splitUText(URegularExpression    *regexp2,
1942                  UText                 *destFields[],
1943                  int32_t                destFieldsCapacity,
1944                  UErrorCode            *status) {
1945    RegularExpression *regexp = (RegularExpression*)regexp2;
1946    return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1947}
1948
1949
1950#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1951
1952