1/*
2*******************************************************************************
3*   Copyright (C) 2004-2013, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  uregex.cpp
7*/
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13#include "unicode/regex.h"
14#include "unicode/uregex.h"
15#include "unicode/unistr.h"
16#include "unicode/ustring.h"
17#include "unicode/uchar.h"
18#include "unicode/uobject.h"
19#include "unicode/utf16.h"
20#include "umutex.h"
21#include "uassert.h"
22#include "cmemory.h"
23
24#include "regextxt.h"
25
26#include <stdio.h>
27
28U_NAMESPACE_BEGIN
29
// Space left in a buffer of capacity len once idx units are used; clamped so it is never negative.
#define REMAINING_CAPACITY(idx,len) ((0<((len)-(idx)))?((len)-(idx)):0)
31
32struct RegularExpression: public UMemory {
33public:
34    RegularExpression();
35    ~RegularExpression();
36    int32_t           fMagic;
37    RegexPattern     *fPat;
38    u_atomic_int32_t *fPatRefCount;
39    UChar            *fPatString;
40    int32_t           fPatStringLen;
41    RegexMatcher     *fMatcher;
42    const UChar      *fText;         // Text from setText()
43    int32_t           fTextLength;   // Length provided by user with setText(), which
44                                     //  may be -1.
45    UBool             fOwnsText;
46};
47
// Marker value kept in RegularExpression::fMagic; the bytes spell "rexp" in ASCII.
static const int32_t REXP_MAGIC = 0x72657870;
49
50RegularExpression::RegularExpression() {
51    fMagic        = REXP_MAGIC;
52    fPat          = NULL;
53    fPatRefCount  = NULL;
54    fPatString    = NULL;
55    fPatStringLen = 0;
56    fMatcher      = NULL;
57    fText         = NULL;
58    fTextLength   = 0;
59    fOwnsText     = FALSE;
60}
61
62RegularExpression::~RegularExpression() {
63    delete fMatcher;
64    fMatcher = NULL;
65    if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
66        delete fPat;
67        uprv_free(fPatString);
68        uprv_free((void *)fPatRefCount);
69    }
70    if (fOwnsText && fText!=NULL) {
71        uprv_free((void *)fText);
72    }
73    fMagic = 0;
74}
75
76U_NAMESPACE_END
77
78U_NAMESPACE_USE
79
80//----------------------------------------------------------------------------------------
81//
82//   validateRE    Do boilerplate style checks on API function parameters.
83//                 Return TRUE if they look OK.
84//----------------------------------------------------------------------------------------
85static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
86    if (U_FAILURE(*status)) {
87        return FALSE;
88    }
89    if (re == NULL || re->fMagic != REXP_MAGIC) {
90        *status = U_ILLEGAL_ARGUMENT_ERROR;
91        return FALSE;
92    }
93    // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
94    if (requiresText && re->fText == NULL && !re->fOwnsText) {
95        *status = U_REGEX_INVALID_STATE;
96        return FALSE;
97    }
98    return TRUE;
99}
100
101//----------------------------------------------------------------------------------------
102//
103//    uregex_open
104//
105//----------------------------------------------------------------------------------------
106U_CAPI URegularExpression *  U_EXPORT2
107uregex_open( const  UChar          *pattern,
108                    int32_t         patternLength,
109                    uint32_t        flags,
110                    UParseError    *pe,
111                    UErrorCode     *status) {
112
113    if (U_FAILURE(*status)) {
114        return NULL;
115    }
116    if (pattern == NULL || patternLength < -1 || patternLength == 0) {
117        *status = U_ILLEGAL_ARGUMENT_ERROR;
118        return NULL;
119    }
120    int32_t actualPatLen = patternLength;
121    if (actualPatLen == -1) {
122        actualPatLen = u_strlen(pattern);
123    }
124
125    RegularExpression  *re     = new RegularExpression;
126    u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
127    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
128    if (re == NULL || refC == NULL || patBuf == NULL) {
129        *status = U_MEMORY_ALLOCATION_ERROR;
130        delete re;
131        uprv_free((void *)refC);
132        uprv_free(patBuf);
133        return NULL;
134    }
135    re->fPatRefCount = refC;
136    *re->fPatRefCount = 1;
137
138    //
139    // Make a copy of the pattern string, so we can return it later if asked.
140    //    For compiling the pattern, we will use a UText wrapper around
141    //    this local copy, to avoid making even more copies.
142    //
143    re->fPatString    = patBuf;
144    re->fPatStringLen = patternLength;
145    u_memcpy(patBuf, pattern, actualPatLen);
146    patBuf[actualPatLen] = 0;
147
148    UText patText = UTEXT_INITIALIZER;
149    utext_openUChars(&patText, patBuf, patternLength, status);
150
151    //
152    // Compile the pattern
153    //
154    if (pe != NULL) {
155        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
156    } else {
157        re->fPat = RegexPattern::compile(&patText, flags, *status);
158    }
159    utext_close(&patText);
160
161    if (U_FAILURE(*status)) {
162        goto ErrorExit;
163    }
164
165    //
166    // Create the matcher object
167    //
168    re->fMatcher = re->fPat->matcher(*status);
169    if (U_SUCCESS(*status)) {
170        return (URegularExpression*)re;
171    }
172
173ErrorExit:
174    delete re;
175    return NULL;
176
177}
178
179//----------------------------------------------------------------------------------------
180//
181//    uregex_openUText
182//
183//----------------------------------------------------------------------------------------
184U_CAPI URegularExpression *  U_EXPORT2
185uregex_openUText(UText          *pattern,
186                 uint32_t        flags,
187                 UParseError    *pe,
188                 UErrorCode     *status) {
189
190    if (U_FAILURE(*status)) {
191        return NULL;
192    }
193    if (pattern == NULL) {
194        *status = U_ILLEGAL_ARGUMENT_ERROR;
195        return NULL;
196    }
197
198    int64_t patternNativeLength = utext_nativeLength(pattern);
199
200    if (patternNativeLength == 0) {
201        *status = U_ILLEGAL_ARGUMENT_ERROR;
202        return NULL;
203    }
204
205    RegularExpression *re     = new RegularExpression;
206
207    UErrorCode lengthStatus = U_ZERO_ERROR;
208    int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
209
210    u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
211    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
212    if (re == NULL || refC == NULL || patBuf == NULL) {
213        *status = U_MEMORY_ALLOCATION_ERROR;
214        delete re;
215        uprv_free((void *)refC);
216        uprv_free(patBuf);
217        return NULL;
218    }
219    re->fPatRefCount = refC;
220    *re->fPatRefCount = 1;
221
222    //
223    // Make a copy of the pattern string, so we can return it later if asked.
224    //    For compiling the pattern, we will use a read-only UText wrapper
225    //    around this local copy, to avoid making even more copies.
226    //
227    re->fPatString    = patBuf;
228    re->fPatStringLen = pattern16Length;
229    utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
230
231    UText patText = UTEXT_INITIALIZER;
232    utext_openUChars(&patText, patBuf, pattern16Length, status);
233
234    //
235    // Compile the pattern
236    //
237    if (pe != NULL) {
238        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
239    } else {
240        re->fPat = RegexPattern::compile(&patText, flags, *status);
241    }
242    utext_close(&patText);
243
244    if (U_FAILURE(*status)) {
245        goto ErrorExit;
246    }
247
248    //
249    // Create the matcher object
250    //
251    re->fMatcher = re->fPat->matcher(*status);
252    if (U_SUCCESS(*status)) {
253        return (URegularExpression*)re;
254    }
255
256ErrorExit:
257    delete re;
258    return NULL;
259
260}
261
262//----------------------------------------------------------------------------------------
263//
264//    uregex_close
265//
266//----------------------------------------------------------------------------------------
267U_CAPI void  U_EXPORT2
268uregex_close(URegularExpression  *re2) {
269    RegularExpression *re = (RegularExpression*)re2;
270    UErrorCode  status = U_ZERO_ERROR;
271    if (validateRE(re, FALSE, &status) == FALSE) {
272        return;
273    }
274    delete re;
275}
276
277
278//----------------------------------------------------------------------------------------
279//
280//    uregex_clone
281//
282//----------------------------------------------------------------------------------------
283U_CAPI URegularExpression * U_EXPORT2
284uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
285    RegularExpression *source = (RegularExpression*)source2;
286    if (validateRE(source, FALSE, status) == FALSE) {
287        return NULL;
288    }
289
290    RegularExpression *clone = new RegularExpression;
291    if (clone == NULL) {
292        *status = U_MEMORY_ALLOCATION_ERROR;
293        return NULL;
294    }
295
296    clone->fMatcher = source->fPat->matcher(*status);
297    if (U_FAILURE(*status)) {
298        delete clone;
299        return NULL;
300    }
301
302    clone->fPat          = source->fPat;
303    clone->fPatRefCount  = source->fPatRefCount;
304    clone->fPatString    = source->fPatString;
305    clone->fPatStringLen = source->fPatStringLen;
306    umtx_atomic_inc(source->fPatRefCount);
307    // Note:  fText is not cloned.
308
309    return (URegularExpression*)clone;
310}
311
312
313
314
315//------------------------------------------------------------------------------
316//
317//    uregex_pattern
318//
319//------------------------------------------------------------------------------
320U_CAPI const UChar * U_EXPORT2
321uregex_pattern(const  URegularExpression *regexp2,
322                      int32_t            *patLength,
323                      UErrorCode         *status)  {
324    RegularExpression *regexp = (RegularExpression*)regexp2;
325
326    if (validateRE(regexp, FALSE, status) == FALSE) {
327        return NULL;
328    }
329    if (patLength != NULL) {
330        *patLength = regexp->fPatStringLen;
331    }
332    return regexp->fPatString;
333}
334
335
336//------------------------------------------------------------------------------
337//
338//    uregex_patternUText
339//
340//------------------------------------------------------------------------------
341U_CAPI UText * U_EXPORT2
342uregex_patternUText(const URegularExpression *regexp2,
343                          UErrorCode         *status)  {
344    RegularExpression *regexp = (RegularExpression*)regexp2;
345    return regexp->fPat->patternText(*status);
346}
347
348
349//------------------------------------------------------------------------------
350//
351//    uregex_flags
352//
353//------------------------------------------------------------------------------
354U_CAPI int32_t U_EXPORT2
355uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
356    RegularExpression *regexp = (RegularExpression*)regexp2;
357    if (validateRE(regexp, FALSE, status) == FALSE) {
358        return 0;
359    }
360    int32_t flags = regexp->fPat->flags();
361    return flags;
362}
363
364
365//------------------------------------------------------------------------------
366//
367//    uregex_setText
368//
369//------------------------------------------------------------------------------
370U_CAPI void U_EXPORT2
371uregex_setText(URegularExpression *regexp2,
372               const UChar        *text,
373               int32_t             textLength,
374               UErrorCode         *status)  {
375    RegularExpression *regexp = (RegularExpression*)regexp2;
376    if (validateRE(regexp, FALSE, status) == FALSE) {
377        return;
378    }
379    if (text == NULL || textLength < -1) {
380        *status = U_ILLEGAL_ARGUMENT_ERROR;
381        return;
382    }
383
384    if (regexp->fOwnsText && regexp->fText != NULL) {
385        uprv_free((void *)regexp->fText);
386    }
387
388    regexp->fText       = text;
389    regexp->fTextLength = textLength;
390    regexp->fOwnsText   = FALSE;
391
392    UText input = UTEXT_INITIALIZER;
393    utext_openUChars(&input, text, textLength, status);
394    regexp->fMatcher->reset(&input);
395    utext_close(&input); // reset() made a shallow clone, so we don't need this copy
396}
397
398
399//------------------------------------------------------------------------------
400//
401//    uregex_setUText
402//
403//------------------------------------------------------------------------------
404U_CAPI void U_EXPORT2
405uregex_setUText(URegularExpression *regexp2,
406                UText              *text,
407                UErrorCode         *status) {
408    RegularExpression *regexp = (RegularExpression*)regexp2;
409    if (validateRE(regexp, FALSE, status) == FALSE) {
410        return;
411    }
412    if (text == NULL) {
413        *status = U_ILLEGAL_ARGUMENT_ERROR;
414        return;
415    }
416
417    if (regexp->fOwnsText && regexp->fText != NULL) {
418        uprv_free((void *)regexp->fText);
419    }
420
421    regexp->fText       = NULL; // only fill it in on request
422    regexp->fTextLength = -1;
423    regexp->fOwnsText   = TRUE;
424    regexp->fMatcher->reset(text);
425}
426
427
428
429//------------------------------------------------------------------------------
430//
431//    uregex_getText
432//
433//------------------------------------------------------------------------------
434U_CAPI const UChar * U_EXPORT2
435uregex_getText(URegularExpression *regexp2,
436               int32_t            *textLength,
437               UErrorCode         *status)  {
438    RegularExpression *regexp = (RegularExpression*)regexp2;
439    if (validateRE(regexp, FALSE, status) == FALSE) {
440        return NULL;
441    }
442
443    if (regexp->fText == NULL) {
444        // need to fill in the text
445        UText *inputText = regexp->fMatcher->inputText();
446        int64_t inputNativeLength = utext_nativeLength(inputText);
447        if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
448            regexp->fText = inputText->chunkContents;
449            regexp->fTextLength = (int32_t)inputNativeLength;
450            regexp->fOwnsText = FALSE; // because the UText owns it
451        } else {
452            UErrorCode lengthStatus = U_ZERO_ERROR;
453            regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
454            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
455
456            utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
457            regexp->fText = inputChars;
458            regexp->fOwnsText = TRUE; // should already be set but just in case
459        }
460    }
461
462    if (textLength != NULL) {
463        *textLength = regexp->fTextLength;
464    }
465    return regexp->fText;
466}
467
468
469//------------------------------------------------------------------------------
470//
471//    uregex_getUText
472//
473//------------------------------------------------------------------------------
474U_CAPI UText * U_EXPORT2
475uregex_getUText(URegularExpression *regexp2,
476                UText              *dest,
477                UErrorCode         *status)  {
478    RegularExpression *regexp = (RegularExpression*)regexp2;
479    if (validateRE(regexp, FALSE, status) == FALSE) {
480        return dest;
481    }
482    return regexp->fMatcher->getInput(dest, *status);
483}
484
485
486//------------------------------------------------------------------------------
487//
488//    uregex_refreshUText
489//
490//------------------------------------------------------------------------------
491U_CAPI void U_EXPORT2
492uregex_refreshUText(URegularExpression *regexp2,
493                    UText              *text,
494                    UErrorCode         *status) {
495    RegularExpression *regexp = (RegularExpression*)regexp2;
496    if (validateRE(regexp, FALSE, status) == FALSE) {
497        return;
498    }
499    regexp->fMatcher->refreshInputText(text, *status);
500}
501
502
503//------------------------------------------------------------------------------
504//
505//    uregex_matches
506//
507//------------------------------------------------------------------------------
508U_CAPI UBool U_EXPORT2
509uregex_matches(URegularExpression *regexp2,
510               int32_t            startIndex,
511               UErrorCode        *status)  {
512    return uregex_matches64( regexp2, (int64_t)startIndex, status);
513}
514
515U_CAPI UBool U_EXPORT2
516uregex_matches64(URegularExpression *regexp2,
517                 int64_t            startIndex,
518                 UErrorCode        *status)  {
519    RegularExpression *regexp = (RegularExpression*)regexp2;
520    UBool result = FALSE;
521    if (validateRE(regexp, TRUE, status) == FALSE) {
522        return result;
523    }
524    if (startIndex == -1) {
525        result = regexp->fMatcher->matches(*status);
526    } else {
527        result = regexp->fMatcher->matches(startIndex, *status);
528    }
529    return result;
530}
531
532
533//------------------------------------------------------------------------------
534//
535//    uregex_lookingAt
536//
537//------------------------------------------------------------------------------
538U_CAPI UBool U_EXPORT2
539uregex_lookingAt(URegularExpression *regexp2,
540                 int32_t             startIndex,
541                 UErrorCode         *status)  {
542    return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
543}
544
545U_CAPI UBool U_EXPORT2
546uregex_lookingAt64(URegularExpression *regexp2,
547                   int64_t             startIndex,
548                   UErrorCode         *status)  {
549    RegularExpression *regexp = (RegularExpression*)regexp2;
550    UBool result = FALSE;
551    if (validateRE(regexp, TRUE, status) == FALSE) {
552        return result;
553    }
554    if (startIndex == -1) {
555        result = regexp->fMatcher->lookingAt(*status);
556    } else {
557        result = regexp->fMatcher->lookingAt(startIndex, *status);
558    }
559    return result;
560}
561
562
563
564//------------------------------------------------------------------------------
565//
566//    uregex_find
567//
568//------------------------------------------------------------------------------
569U_CAPI UBool U_EXPORT2
570uregex_find(URegularExpression *regexp2,
571            int32_t             startIndex,
572            UErrorCode         *status)  {
573    return uregex_find64( regexp2, (int64_t)startIndex, status);
574}
575
576U_CAPI UBool U_EXPORT2
577uregex_find64(URegularExpression *regexp2,
578              int64_t             startIndex,
579              UErrorCode         *status)  {
580    RegularExpression *regexp = (RegularExpression*)regexp2;
581    UBool result = FALSE;
582    if (validateRE(regexp, TRUE, status) == FALSE) {
583        return result;
584    }
585    if (startIndex == -1) {
586        regexp->fMatcher->resetPreserveRegion();
587        result = regexp->fMatcher->find();
588    } else {
589        result = regexp->fMatcher->find(startIndex, *status);
590    }
591    return result;
592}
593
594
595//------------------------------------------------------------------------------
596//
597//    uregex_findNext
598//
599//------------------------------------------------------------------------------
600U_CAPI UBool U_EXPORT2
601uregex_findNext(URegularExpression *regexp2,
602                UErrorCode         *status)  {
603    RegularExpression *regexp = (RegularExpression*)regexp2;
604    if (validateRE(regexp, TRUE, status) == FALSE) {
605        return FALSE;
606    }
607    UBool result = regexp->fMatcher->find();
608    return result;
609}
610
611//------------------------------------------------------------------------------
612//
613//    uregex_groupCount
614//
615//------------------------------------------------------------------------------
616U_CAPI int32_t U_EXPORT2
617uregex_groupCount(URegularExpression *regexp2,
618                  UErrorCode         *status)  {
619    RegularExpression *regexp = (RegularExpression*)regexp2;
620    if (validateRE(regexp, FALSE, status) == FALSE) {
621        return 0;
622    }
623    int32_t  result = regexp->fMatcher->groupCount();
624    return result;
625}
626
627
628//------------------------------------------------------------------------------
629//
630//    uregex_group
631//
632//------------------------------------------------------------------------------
633U_CAPI int32_t U_EXPORT2
634uregex_group(URegularExpression *regexp2,
635             int32_t             groupNum,
636             UChar              *dest,
637             int32_t             destCapacity,
638             UErrorCode          *status)  {
639    RegularExpression *regexp = (RegularExpression*)regexp2;
640    if (validateRE(regexp, TRUE, status) == FALSE) {
641        return 0;
642    }
643    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
644        *status = U_ILLEGAL_ARGUMENT_ERROR;
645        return 0;
646    }
647
648    if (destCapacity == 0 || regexp->fText != NULL) {
649        // If preflighting or if we already have the text as UChars,
650        // this is a little cheaper than going through uregex_groupUTextDeep()
651
652        //
653        // Pick up the range of characters from the matcher
654        //
655        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
656        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
657        if (U_FAILURE(*status)) {
658            return 0;
659        }
660
661        //
662        // Trim length based on buffer capacity
663        //
664        int32_t fullLength = endIx - startIx;
665        int32_t copyLength = fullLength;
666        if (copyLength < destCapacity) {
667            dest[copyLength] = 0;
668        } else if (copyLength == destCapacity) {
669            *status = U_STRING_NOT_TERMINATED_WARNING;
670        } else {
671            copyLength = destCapacity;
672            *status = U_BUFFER_OVERFLOW_ERROR;
673        }
674
675        //
676        // Copy capture group to user's buffer
677        //
678        if (copyLength > 0) {
679            u_memcpy(dest, &regexp->fText[startIx], copyLength);
680        }
681        return fullLength;
682    } else {
683        UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
684        int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
685        utext_close(groupText);
686        return result;
687    }
688}
689
690
691//------------------------------------------------------------------------------
692//
693//    uregex_groupUText
694//
695//------------------------------------------------------------------------------
696U_CAPI UText * U_EXPORT2
697uregex_groupUText(URegularExpression *regexp2,
698                  int32_t             groupNum,
699                  UText              *dest,
700                  int64_t            *groupLength,
701                  UErrorCode         *status)  {
702    RegularExpression *regexp = (RegularExpression*)regexp2;
703    if (validateRE(regexp, TRUE, status) == FALSE) {
704        UErrorCode emptyTextStatus = U_ZERO_ERROR;
705        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
706    }
707
708    return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
709}
710
711//------------------------------------------------------------------------------
712//
713//    uregex_groupUTextDeep
714//
715//------------------------------------------------------------------------------
716U_CAPI UText * U_EXPORT2
717uregex_groupUTextDeep(URegularExpression *regexp2,
718                  int32_t             groupNum,
719                  UText              *dest,
720                  UErrorCode         *status)  {
721    RegularExpression *regexp = (RegularExpression*)regexp2;
722    if (validateRE(regexp, TRUE, status) == FALSE) {
723        UErrorCode emptyTextStatus = U_ZERO_ERROR;
724        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
725    }
726
727    if (regexp->fText != NULL) {
728        //
729        // Pick up the range of characters from the matcher
730        // and use our already-extracted characters
731        //
732        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
733        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
734        if (U_FAILURE(*status)) {
735            UErrorCode emptyTextStatus = U_ZERO_ERROR;
736            return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
737        }
738
739        if (dest) {
740            utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
741        } else {
742            UText groupText = UTEXT_INITIALIZER;
743            utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
744            dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
745            utext_close(&groupText);
746        }
747
748        return dest;
749    } else {
750        return regexp->fMatcher->group(groupNum, dest, *status);
751    }
752}
753
754//------------------------------------------------------------------------------
755//
756//    uregex_start
757//
758//------------------------------------------------------------------------------
759U_CAPI int32_t U_EXPORT2
760uregex_start(URegularExpression *regexp2,
761             int32_t             groupNum,
762             UErrorCode          *status)  {
763    return (int32_t)uregex_start64( regexp2, groupNum, status);
764}
765
766U_CAPI int64_t U_EXPORT2
767uregex_start64(URegularExpression *regexp2,
768               int32_t             groupNum,
769               UErrorCode          *status)  {
770    RegularExpression *regexp = (RegularExpression*)regexp2;
771    if (validateRE(regexp, TRUE, status) == FALSE) {
772        return 0;
773    }
774    int32_t result = regexp->fMatcher->start(groupNum, *status);
775    return result;
776}
777
778//------------------------------------------------------------------------------
779//
780//    uregex_end
781//
782//------------------------------------------------------------------------------
783U_CAPI int32_t U_EXPORT2
784uregex_end(URegularExpression   *regexp2,
785           int32_t               groupNum,
786           UErrorCode           *status)  {
787    return (int32_t)uregex_end64( regexp2, groupNum, status);
788}
789
790U_CAPI int64_t U_EXPORT2
791uregex_end64(URegularExpression   *regexp2,
792             int32_t               groupNum,
793             UErrorCode           *status)  {
794    RegularExpression *regexp = (RegularExpression*)regexp2;
795    if (validateRE(regexp, TRUE, status) == FALSE) {
796        return 0;
797    }
798    int32_t result = regexp->fMatcher->end(groupNum, *status);
799    return result;
800}
801
802//------------------------------------------------------------------------------
803//
804//    uregex_reset
805//
806//------------------------------------------------------------------------------
807U_CAPI void U_EXPORT2
808uregex_reset(URegularExpression    *regexp2,
809             int32_t               index,
810             UErrorCode            *status)  {
811    uregex_reset64( regexp2, (int64_t)index, status);
812}
813
814U_CAPI void U_EXPORT2
815uregex_reset64(URegularExpression    *regexp2,
816               int64_t               index,
817               UErrorCode            *status)  {
818    RegularExpression *regexp = (RegularExpression*)regexp2;
819    if (validateRE(regexp, TRUE, status) == FALSE) {
820        return;
821    }
822    regexp->fMatcher->reset(index, *status);
823}
824
825
826//------------------------------------------------------------------------------
827//
828//    uregex_setRegion
829//
830//------------------------------------------------------------------------------
831U_CAPI void U_EXPORT2
832uregex_setRegion(URegularExpression   *regexp2,
833                 int32_t               regionStart,
834                 int32_t               regionLimit,
835                 UErrorCode           *status)  {
836    uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
837}
838
839U_CAPI void U_EXPORT2
840uregex_setRegion64(URegularExpression   *regexp2,
841                   int64_t               regionStart,
842                   int64_t               regionLimit,
843                   UErrorCode           *status)  {
844    RegularExpression *regexp = (RegularExpression*)regexp2;
845    if (validateRE(regexp, TRUE, status) == FALSE) {
846        return;
847    }
848    regexp->fMatcher->region(regionStart, regionLimit, *status);
849}
850
851
852//------------------------------------------------------------------------------
853//
854//    uregex_setRegionAndStart
855//
856//------------------------------------------------------------------------------
857U_CAPI void U_EXPORT2
858uregex_setRegionAndStart(URegularExpression   *regexp2,
859                 int64_t               regionStart,
860                 int64_t               regionLimit,
861                 int64_t               startIndex,
862                 UErrorCode           *status)  {
863    RegularExpression *regexp = (RegularExpression*)regexp2;
864    if (validateRE(regexp, TRUE, status) == FALSE) {
865        return;
866    }
867    regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
868}
869
870//------------------------------------------------------------------------------
871//
872//    uregex_regionStart
873//
874//------------------------------------------------------------------------------
875U_CAPI int32_t U_EXPORT2
876uregex_regionStart(const  URegularExpression   *regexp2,
877                          UErrorCode           *status)  {
878    return (int32_t)uregex_regionStart64(regexp2, status);
879}
880
881U_CAPI int64_t U_EXPORT2
882uregex_regionStart64(const  URegularExpression   *regexp2,
883                            UErrorCode           *status)  {
884    RegularExpression *regexp = (RegularExpression*)regexp2;
885    if (validateRE(regexp, TRUE, status) == FALSE) {
886        return 0;
887    }
888    return regexp->fMatcher->regionStart();
889}
890
891
892//------------------------------------------------------------------------------
893//
894//    uregex_regionEnd
895//
896//------------------------------------------------------------------------------
897U_CAPI int32_t U_EXPORT2
898uregex_regionEnd(const  URegularExpression   *regexp2,
899                        UErrorCode           *status)  {
900    return (int32_t)uregex_regionEnd64(regexp2, status);
901}
902
903U_CAPI int64_t U_EXPORT2
904uregex_regionEnd64(const  URegularExpression   *regexp2,
905                          UErrorCode           *status)  {
906    RegularExpression *regexp = (RegularExpression*)regexp2;
907    if (validateRE(regexp, TRUE, status) == FALSE) {
908        return 0;
909    }
910    return regexp->fMatcher->regionEnd();
911}
912
913
914//------------------------------------------------------------------------------
915//
916//    uregex_hasTransparentBounds
917//
918//------------------------------------------------------------------------------
919U_CAPI UBool U_EXPORT2
920uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
921                                   UErrorCode           *status)  {
922    RegularExpression *regexp = (RegularExpression*)regexp2;
923    if (validateRE(regexp, FALSE, status) == FALSE) {
924        return FALSE;
925    }
926    return regexp->fMatcher->hasTransparentBounds();
927}
928
929
930//------------------------------------------------------------------------------
931//
932//    uregex_useTransparentBounds
933//
934//------------------------------------------------------------------------------
935U_CAPI void U_EXPORT2
936uregex_useTransparentBounds(URegularExpression    *regexp2,
937                            UBool                  b,
938                            UErrorCode            *status)  {
939    RegularExpression *regexp = (RegularExpression*)regexp2;
940    if (validateRE(regexp, FALSE, status) == FALSE) {
941        return;
942    }
943    regexp->fMatcher->useTransparentBounds(b);
944}
945
946
947//------------------------------------------------------------------------------
948//
949//    uregex_hasAnchoringBounds
950//
951//------------------------------------------------------------------------------
952U_CAPI UBool U_EXPORT2
953uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
954                                 UErrorCode           *status)  {
955    RegularExpression *regexp = (RegularExpression*)regexp2;
956    if (validateRE(regexp, FALSE, status) == FALSE) {
957        return FALSE;
958    }
959    return regexp->fMatcher->hasAnchoringBounds();
960}
961
962
963//------------------------------------------------------------------------------
964//
965//    uregex_useAnchoringBounds
966//
967//------------------------------------------------------------------------------
968U_CAPI void U_EXPORT2
969uregex_useAnchoringBounds(URegularExpression    *regexp2,
970                          UBool                  b,
971                          UErrorCode            *status)  {
972    RegularExpression *regexp = (RegularExpression*)regexp2;
973    if (validateRE(regexp, FALSE, status) == FALSE) {
974        return;
975    }
976    regexp->fMatcher->useAnchoringBounds(b);
977}
978
979
980//------------------------------------------------------------------------------
981//
982//    uregex_hitEnd
983//
984//------------------------------------------------------------------------------
985U_CAPI UBool U_EXPORT2
986uregex_hitEnd(const  URegularExpression   *regexp2,
987                     UErrorCode           *status)  {
988    RegularExpression *regexp = (RegularExpression*)regexp2;
989    if (validateRE(regexp, TRUE, status) == FALSE) {
990        return FALSE;
991    }
992    return regexp->fMatcher->hitEnd();
993}
994
995
996//------------------------------------------------------------------------------
997//
998//    uregex_requireEnd
999//
1000//------------------------------------------------------------------------------
1001U_CAPI UBool U_EXPORT2
1002uregex_requireEnd(const  URegularExpression   *regexp2,
1003                         UErrorCode           *status)  {
1004    RegularExpression *regexp = (RegularExpression*)regexp2;
1005    if (validateRE(regexp, TRUE, status) == FALSE) {
1006        return FALSE;
1007    }
1008    return regexp->fMatcher->requireEnd();
1009}
1010
1011
1012//------------------------------------------------------------------------------
1013//
1014//    uregex_setTimeLimit
1015//
1016//------------------------------------------------------------------------------
1017U_CAPI void U_EXPORT2
1018uregex_setTimeLimit(URegularExpression   *regexp2,
1019                    int32_t               limit,
1020                    UErrorCode           *status) {
1021    RegularExpression *regexp = (RegularExpression*)regexp2;
1022    if (validateRE(regexp, FALSE, status)) {
1023        regexp->fMatcher->setTimeLimit(limit, *status);
1024    }
1025}
1026
1027
1028
1029//------------------------------------------------------------------------------
1030//
1031//    uregex_getTimeLimit
1032//
1033//------------------------------------------------------------------------------
1034U_CAPI int32_t U_EXPORT2
1035uregex_getTimeLimit(const  URegularExpression   *regexp2,
1036                           UErrorCode           *status) {
1037    int32_t retVal = 0;
1038    RegularExpression *regexp = (RegularExpression*)regexp2;
1039    if (validateRE(regexp, FALSE, status)) {
1040        retVal = regexp->fMatcher->getTimeLimit();
1041    }
1042    return retVal;
1043}
1044
1045
1046
1047//------------------------------------------------------------------------------
1048//
1049//    uregex_setStackLimit
1050//
1051//------------------------------------------------------------------------------
1052U_CAPI void U_EXPORT2
1053uregex_setStackLimit(URegularExpression   *regexp2,
1054                     int32_t               limit,
1055                     UErrorCode           *status) {
1056    RegularExpression *regexp = (RegularExpression*)regexp2;
1057    if (validateRE(regexp, FALSE, status)) {
1058        regexp->fMatcher->setStackLimit(limit, *status);
1059    }
1060}
1061
1062
1063
1064//------------------------------------------------------------------------------
1065//
1066//    uregex_getStackLimit
1067//
1068//------------------------------------------------------------------------------
1069U_CAPI int32_t U_EXPORT2
1070uregex_getStackLimit(const  URegularExpression   *regexp2,
1071                            UErrorCode           *status) {
1072    int32_t retVal = 0;
1073    RegularExpression *regexp = (RegularExpression*)regexp2;
1074    if (validateRE(regexp, FALSE, status)) {
1075        retVal = regexp->fMatcher->getStackLimit();
1076    }
1077    return retVal;
1078}
1079
1080
1081//------------------------------------------------------------------------------
1082//
1083//    uregex_setMatchCallback
1084//
1085//------------------------------------------------------------------------------
1086U_CAPI void U_EXPORT2
1087uregex_setMatchCallback(URegularExpression      *regexp2,
1088                        URegexMatchCallback     *callback,
1089                        const void              *context,
1090                        UErrorCode              *status) {
1091    RegularExpression *regexp = (RegularExpression*)regexp2;
1092    if (validateRE(regexp, FALSE, status)) {
1093        regexp->fMatcher->setMatchCallback(callback, context, *status);
1094    }
1095}
1096
1097
1098//------------------------------------------------------------------------------
1099//
1100//    uregex_getMatchCallback
1101//
1102//------------------------------------------------------------------------------
1103U_CAPI void U_EXPORT2
1104uregex_getMatchCallback(const URegularExpression    *regexp2,
1105                        URegexMatchCallback        **callback,
1106                        const void                 **context,
1107                        UErrorCode                  *status) {
1108    RegularExpression *regexp = (RegularExpression*)regexp2;
1109     if (validateRE(regexp, FALSE, status)) {
1110         regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1111     }
1112}
1113
1114
1115//------------------------------------------------------------------------------
1116//
//    uregex_setFindProgressCallback
1118//
1119//------------------------------------------------------------------------------
1120U_CAPI void U_EXPORT2
1121uregex_setFindProgressCallback(URegularExpression              *regexp2,
1122                                URegexFindProgressCallback      *callback,
1123                                const void                      *context,
1124                                UErrorCode                      *status) {
1125    RegularExpression *regexp = (RegularExpression*)regexp2;
1126    if (validateRE(regexp, FALSE, status)) {
1127        regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1128    }
1129}
1130
1131
1132//------------------------------------------------------------------------------
1133//
//    uregex_getFindProgressCallback
1135//
1136//------------------------------------------------------------------------------
1137U_CAPI void U_EXPORT2
1138uregex_getFindProgressCallback(const URegularExpression          *regexp2,
1139                                URegexFindProgressCallback        **callback,
1140                                const void                        **context,
1141                                UErrorCode                        *status) {
1142    RegularExpression *regexp = (RegularExpression*)regexp2;
1143     if (validateRE(regexp, FALSE, status)) {
1144         regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1145     }
1146}
1147
1148
1149//------------------------------------------------------------------------------
1150//
1151//    uregex_replaceAll
1152//
1153//------------------------------------------------------------------------------
1154U_CAPI int32_t U_EXPORT2
1155uregex_replaceAll(URegularExpression    *regexp2,
1156                  const UChar           *replacementText,
1157                  int32_t                replacementLength,
1158                  UChar                 *destBuf,
1159                  int32_t                destCapacity,
1160                  UErrorCode            *status)  {
1161    RegularExpression *regexp = (RegularExpression*)regexp2;
1162    if (validateRE(regexp, TRUE, status) == FALSE) {
1163        return 0;
1164    }
1165    if (replacementText == NULL || replacementLength < -1 ||
1166        (destBuf == NULL && destCapacity > 0) ||
1167        destCapacity < 0) {
1168        *status = U_ILLEGAL_ARGUMENT_ERROR;
1169        return 0;
1170    }
1171
1172    int32_t   len = 0;
1173
1174    uregex_reset(regexp2, 0, status);
1175
1176    // Note: Seperate error code variables for findNext() and appendReplacement()
1177    //       are used so that destination buffer overflow errors
1178    //       in appendReplacement won't stop findNext() from working.
1179    //       appendReplacement() and appendTail() special case incoming buffer
1180    //       overflow errors, continuing to return the correct length.
1181    UErrorCode  findStatus = *status;
1182    while (uregex_findNext(regexp2, &findStatus)) {
1183        len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1184                                        &destBuf, &destCapacity, status);
1185    }
1186    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1187
1188    if (U_FAILURE(findStatus)) {
1189        // If anything went wrong with the findNext(), make that error trump
1190        //   whatever may have happened with the append() operations.
1191        //   Errors in findNext() are not expected.
1192        *status = findStatus;
1193    }
1194
1195    return len;
1196}
1197
1198
1199//------------------------------------------------------------------------------
1200//
1201//    uregex_replaceAllUText
1202//
1203//------------------------------------------------------------------------------
1204U_CAPI UText * U_EXPORT2
1205uregex_replaceAllUText(URegularExpression    *regexp2,
1206                       UText                 *replacementText,
1207                       UText                 *dest,
1208                       UErrorCode            *status)  {
1209    RegularExpression *regexp = (RegularExpression*)regexp2;
1210    if (validateRE(regexp, TRUE, status) == FALSE) {
1211        return 0;
1212    }
1213    if (replacementText == NULL) {
1214        *status = U_ILLEGAL_ARGUMENT_ERROR;
1215        return 0;
1216    }
1217
1218    dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1219    return dest;
1220}
1221
1222
1223//------------------------------------------------------------------------------
1224//
1225//    uregex_replaceFirst
1226//
1227//------------------------------------------------------------------------------
1228U_CAPI int32_t U_EXPORT2
1229uregex_replaceFirst(URegularExpression  *regexp2,
1230                    const UChar         *replacementText,
1231                    int32_t              replacementLength,
1232                    UChar               *destBuf,
1233                    int32_t              destCapacity,
1234                    UErrorCode          *status)  {
1235    RegularExpression *regexp = (RegularExpression*)regexp2;
1236    if (validateRE(regexp, TRUE, status) == FALSE) {
1237        return 0;
1238    }
1239    if (replacementText == NULL || replacementLength < -1 ||
1240        (destBuf == NULL && destCapacity > 0) ||
1241        destCapacity < 0) {
1242        *status = U_ILLEGAL_ARGUMENT_ERROR;
1243        return 0;
1244    }
1245
1246    int32_t   len = 0;
1247    UBool     findSucceeded;
1248    uregex_reset(regexp2, 0, status);
1249    findSucceeded = uregex_find(regexp2, 0, status);
1250    if (findSucceeded) {
1251        len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
1252                                       &destBuf, &destCapacity, status);
1253    }
1254    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1255
1256    return len;
1257}
1258
1259
1260//------------------------------------------------------------------------------
1261//
1262//    uregex_replaceFirstUText
1263//
1264//------------------------------------------------------------------------------
1265U_CAPI UText * U_EXPORT2
1266uregex_replaceFirstUText(URegularExpression  *regexp2,
1267                         UText                 *replacementText,
1268                         UText                 *dest,
1269                         UErrorCode            *status)  {
1270    RegularExpression *regexp = (RegularExpression*)regexp2;
1271    if (validateRE(regexp, TRUE, status) == FALSE) {
1272        return 0;
1273    }
1274    if (replacementText == NULL) {
1275        *status = U_ILLEGAL_ARGUMENT_ERROR;
1276        return 0;
1277    }
1278
1279    dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1280    return dest;
1281}
1282
1283
1284//------------------------------------------------------------------------------
1285//
1286//    uregex_appendReplacement
1287//
1288//------------------------------------------------------------------------------
1289
1290U_NAMESPACE_BEGIN
1291//
1292//  Dummy class, because these functions need to be friends of class RegexMatcher,
1293//               and stand-alone C functions don't work as friends
1294//
1295class RegexCImpl {
1296 public:
1297   inline static  int32_t appendReplacement(RegularExpression    *regexp,
1298                      const UChar           *replacementText,
1299                      int32_t                replacementLength,
1300                      UChar                **destBuf,
1301                      int32_t               *destCapacity,
1302                      UErrorCode            *status);
1303
1304   inline static int32_t appendTail(RegularExpression    *regexp,
1305        UChar                **destBuf,
1306        int32_t               *destCapacity,
1307        UErrorCode            *status);
1308
1309    inline static int32_t split(RegularExpression    *regexp,
1310        UChar                 *destBuf,
1311        int32_t                destCapacity,
1312        int32_t               *requiredCapacity,
1313        UChar                 *destFields[],
1314        int32_t                destFieldsCapacity,
1315        UErrorCode            *status);
1316};
1317
1318U_NAMESPACE_END
1319
1320
1321
1322static const UChar BACKSLASH  = 0x5c;
1323static const UChar DOLLARSIGN = 0x24;
1324
1325//
1326//  Move a character to an output buffer, with bounds checking on the index.
1327//      Index advances even if capacity is exceeded, for preflight size computations.
1328//      This little sequence is used a LOT.
1329//
1330static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
1331    if (*idx < bufCapacity) {
1332        buf[*idx] = c;
1333    }
1334    (*idx)++;
1335}
1336
1337
1338//
1339//  appendReplacement, the actual implementation.
1340//
1341int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
1342                                      const UChar           *replacementText,
1343                                      int32_t                replacementLength,
1344                                      UChar                **destBuf,
1345                                      int32_t               *destCapacity,
1346                                      UErrorCode            *status)  {
1347
1348    // If we come in with a buffer overflow error, don't suppress the operation.
1349    //  A series of appendReplacements, appendTail need to correctly preflight
1350    //  the buffer size when an overflow happens somewhere in the middle.
1351    UBool pendingBufferOverflow = FALSE;
1352    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1353        pendingBufferOverflow = TRUE;
1354        *status = U_ZERO_ERROR;
1355    }
1356
1357    //
1358    // Validate all paramters
1359    //
1360    if (validateRE(regexp, TRUE, status) == FALSE) {
1361        return 0;
1362    }
1363    if (replacementText == NULL || replacementLength < -1 ||
1364        destCapacity == NULL || destBuf == NULL ||
1365        (*destBuf == NULL && *destCapacity > 0) ||
1366        *destCapacity < 0) {
1367        *status = U_ILLEGAL_ARGUMENT_ERROR;
1368        return 0;
1369    }
1370
1371    RegexMatcher *m = regexp->fMatcher;
1372    if (m->fMatch == FALSE) {
1373        *status = U_REGEX_INVALID_STATE;
1374        return 0;
1375    }
1376
1377    UChar    *dest             = *destBuf;
1378    int32_t   capacity         = *destCapacity;
1379    int32_t   destIdx          =  0;
1380    int32_t   i;
1381
1382    // If it wasn't supplied by the caller,  get the length of the replacement text.
1383    //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
1384    //          the fly and avoid this step.
1385    if (replacementLength == -1) {
1386        replacementLength = u_strlen(replacementText);
1387    }
1388
1389    // Copy input string from the end of previous match to start of current match
1390    if (regexp->fText != NULL) {
1391        int32_t matchStart;
1392        int32_t lastMatchEnd;
1393        if (UTEXT_USES_U16(m->fInputText)) {
1394            lastMatchEnd = (int32_t)m->fLastMatchEnd;
1395            matchStart = (int32_t)m->fMatchStart;
1396        } else {
1397            // !!!: Would like a better way to do this!
1398            UErrorCode status = U_ZERO_ERROR;
1399            lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
1400            status = U_ZERO_ERROR;
1401            matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
1402        }
1403        for (i=lastMatchEnd; i<matchStart; i++) {
1404            appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
1405        }
1406    } else {
1407        UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1408        destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1409                                 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
1410                                 &possibleOverflowError);
1411    }
1412    U_ASSERT(destIdx >= 0);
1413
1414    // scan the replacement text, looking for substitutions ($n) and \escapes.
1415    int32_t  replIdx = 0;
1416    while (replIdx < replacementLength) {
1417        UChar  c = replacementText[replIdx];
1418        replIdx++;
1419        if (c != DOLLARSIGN && c != BACKSLASH) {
1420            // Common case, no substitution, no escaping,
1421            //  just copy the char to the dest buf.
1422            appendToBuf(c, &destIdx, dest, capacity);
1423            continue;
1424        }
1425
1426        if (c == BACKSLASH) {
1427            // Backslash Escape.  Copy the following char out without further checks.
1428            //                    Note:  Surrogate pairs don't need any special handling
1429            //                           The second half wont be a '$' or a '\', and
1430            //                           will move to the dest normally on the next
1431            //                           loop iteration.
1432            if (replIdx >= replacementLength) {
1433                break;
1434            }
1435            c = replacementText[replIdx];
1436
1437            if (c==0x55/*U*/ || c==0x75/*u*/) {
1438                // We have a \udddd or \Udddddddd escape sequence.
1439                UChar32 escapedChar =
1440                    u_unescapeAt(uregex_ucstr_unescape_charAt,
1441                       &replIdx,                   // Index is updated by unescapeAt
1442                       replacementLength,          // Length of replacement text
1443                       (void *)replacementText);
1444
1445                if (escapedChar != (UChar32)0xFFFFFFFF) {
1446                    if (escapedChar <= 0xffff) {
1447                        appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1448                    } else {
1449                        appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1450                        appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1451                    }
1452                    continue;
1453                }
1454                // Note:  if the \u escape was invalid, just fall through and
1455                //        treat it as a plain \<anything> escape.
1456            }
1457
1458            // Plain backslash escape.  Just put out the escaped character.
1459            appendToBuf(c, &destIdx, dest, capacity);
1460
1461            replIdx++;
1462            continue;
1463        }
1464
1465
1466
1467        // We've got a $.  Pick up a capture group number if one follows.
1468        // Consume at most the number of digits necessary for the largest capture
1469        // number that is valid for this pattern.
1470
1471        int32_t numDigits = 0;
1472        int32_t groupNum  = 0;
1473        UChar32 digitC;
1474        for (;;) {
1475            if (replIdx >= replacementLength) {
1476                break;
1477            }
1478            U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
1479            if (u_isdigit(digitC) == FALSE) {
1480                break;
1481            }
1482
1483            U16_FWD_1(replacementText, replIdx, replacementLength);
1484            groupNum=groupNum*10 + u_charDigitValue(digitC);
1485            numDigits++;
1486            if (numDigits >= m->fPattern->fMaxCaptureDigits) {
1487                break;
1488            }
1489        }
1490
1491
1492        if (numDigits == 0) {
1493            // The $ didn't introduce a group number at all.
1494            // Treat it as just part of the substitution text.
1495            appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
1496            continue;
1497        }
1498
1499        // Finally, append the capture group data to the destination.
1500        destIdx += uregex_group((URegularExpression*)regexp, groupNum,
1501                                dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1502        if (*status == U_BUFFER_OVERFLOW_ERROR) {
1503            // Ignore buffer overflow when extracting the group.  We need to
1504            //   continue on to get full size of the untruncated result.  We will
1505            //   raise our own buffer overflow error at the end.
1506            *status = U_ZERO_ERROR;
1507        }
1508
1509        if (U_FAILURE(*status)) {
1510            // Can fail if group number is out of range.
1511            break;
1512        }
1513
1514    }
1515
1516    //
1517    //  Nul Terminate the dest buffer if possible.
1518    //  Set the appropriate buffer overflow or not terminated error, if needed.
1519    //
1520    if (destIdx < capacity) {
1521        dest[destIdx] = 0;
1522    } else if (destIdx == *destCapacity) {
1523        *status = U_STRING_NOT_TERMINATED_WARNING;
1524    } else {
1525        *status = U_BUFFER_OVERFLOW_ERROR;
1526    }
1527
1528    //
1529    // Return an updated dest buffer and capacity to the caller.
1530    //
1531    if (destIdx > 0 &&  *destCapacity > 0) {
1532        if (destIdx < capacity) {
1533            *destBuf      += destIdx;
1534            *destCapacity -= destIdx;
1535        } else {
1536            *destBuf      += capacity;
1537            *destCapacity =  0;
1538        }
1539    }
1540
1541    // If we came in with a buffer overflow, make sure we go out with one also.
1542    //   (A zero length match right at the end of the previous match could
1543    //    make this function succeed even though a previous call had overflowed the buf)
1544    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1545        *status = U_BUFFER_OVERFLOW_ERROR;
1546    }
1547
1548    return destIdx;
1549}
1550
1551//
1552//   appendReplacement   the actual API function,
1553//
1554U_CAPI int32_t U_EXPORT2
1555uregex_appendReplacement(URegularExpression    *regexp2,
1556                         const UChar           *replacementText,
1557                         int32_t                replacementLength,
1558                         UChar                **destBuf,
1559                         int32_t               *destCapacity,
1560                         UErrorCode            *status) {
1561
1562    RegularExpression *regexp = (RegularExpression*)regexp2;
1563    return RegexCImpl::appendReplacement(
1564        regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1565}
1566
1567//
1568//   uregex_appendReplacementUText...can just use the normal C++ method
1569//
1570U_CAPI void U_EXPORT2
1571uregex_appendReplacementUText(URegularExpression    *regexp2,
1572                              UText                 *replText,
1573                              UText                 *dest,
1574                              UErrorCode            *status)  {
1575    RegularExpression *regexp = (RegularExpression*)regexp2;
1576    regexp->fMatcher->appendReplacement(dest, replText, *status);
1577}
1578
1579
1580//------------------------------------------------------------------------------
1581//
1582//    uregex_appendTail
1583//
1584//------------------------------------------------------------------------------
1585int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
1586                               UChar                **destBuf,
1587                               int32_t               *destCapacity,
1588                               UErrorCode            *status)
1589{
1590
1591    // If we come in with a buffer overflow error, don't suppress the operation.
1592    //  A series of appendReplacements, appendTail need to correctly preflight
1593    //  the buffer size when an overflow happens somewhere in the middle.
1594    UBool pendingBufferOverflow = FALSE;
1595    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1596        pendingBufferOverflow = TRUE;
1597        *status = U_ZERO_ERROR;
1598    }
1599
1600    if (validateRE(regexp, TRUE, status) == FALSE) {
1601        return 0;
1602    }
1603
1604    if (destCapacity == NULL || destBuf == NULL ||
1605        (*destBuf == NULL && *destCapacity > 0) ||
1606        *destCapacity < 0)
1607    {
1608        *status = U_ILLEGAL_ARGUMENT_ERROR;
1609        return 0;
1610    }
1611
1612    RegexMatcher *m = regexp->fMatcher;
1613
1614    int32_t  destIdx     = 0;
1615    int32_t  destCap     = *destCapacity;
1616    UChar    *dest       = *destBuf;
1617
1618    if (regexp->fText != NULL) {
1619        int32_t srcIdx;
1620        int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1621        if (nativeIdx == -1) {
1622            srcIdx = 0;
1623        } else if (UTEXT_USES_U16(m->fInputText)) {
1624            srcIdx = (int32_t)nativeIdx;
1625        } else {
1626            UErrorCode status = U_ZERO_ERROR;
1627            srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1628        }
1629
1630        for (;;) {
1631            U_ASSERT(destIdx >= 0);
1632
1633            if (srcIdx == regexp->fTextLength) {
1634                break;
1635            }
1636            UChar c = regexp->fText[srcIdx];
1637            if (c == 0 && regexp->fTextLength == -1) {
1638                regexp->fTextLength = srcIdx;
1639                break;
1640            }
1641
1642            if (destIdx < destCap) {
1643                dest[destIdx] = c;
1644            } else {
1645                // We've overflowed the dest buffer.
1646                //  If the total input string length is known, we can
1647                //    compute the total buffer size needed without scanning through the string.
1648                if (regexp->fTextLength > 0) {
1649                    destIdx += (regexp->fTextLength - srcIdx);
1650                    break;
1651                }
1652            }
1653            srcIdx++;
1654            destIdx++;
1655        }
1656    } else {
1657        int64_t  srcIdx;
1658        if (m->fMatch) {
1659            // The most recent call to find() succeeded.
1660            srcIdx = m->fMatchEnd;
1661        } else {
1662            // The last call to find() on this matcher failed().
1663            //   Look back to the end of the last find() that succeeded for src index.
1664            srcIdx = m->fLastMatchEnd;
1665            if (srcIdx == -1)  {
1666                // There has been no successful match with this matcher.
1667                //   We want to copy the whole string.
1668                srcIdx = 0;
1669            }
1670        }
1671
1672        destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1673    }
1674
1675    //
1676    //  NUL terminate the output string, if possible, otherwise issue the
1677    //   appropriate error or warning.
1678    //
1679    if (destIdx < destCap) {
1680        dest[destIdx] = 0;
1681    } else  if (destIdx == destCap) {
1682        *status = U_STRING_NOT_TERMINATED_WARNING;
1683    } else {
1684        *status = U_BUFFER_OVERFLOW_ERROR;
1685    }
1686
1687    //
1688    // Update the user's buffer ptr and capacity vars to reflect the
1689    //   amount used.
1690    //
1691    if (destIdx < destCap) {
1692        *destBuf      += destIdx;
1693        *destCapacity -= destIdx;
1694    } else if (*destBuf != NULL) {
1695        *destBuf      += destCap;
1696        *destCapacity  = 0;
1697    }
1698
1699    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1700        *status = U_BUFFER_OVERFLOW_ERROR;
1701    }
1702
1703    return destIdx;
1704}
1705
1706
1707//
1708//   appendTail   the actual API function
1709//
1710U_CAPI int32_t U_EXPORT2
1711uregex_appendTail(URegularExpression    *regexp2,
1712                  UChar                **destBuf,
1713                  int32_t               *destCapacity,
1714                  UErrorCode            *status)  {
1715    RegularExpression *regexp = (RegularExpression*)regexp2;
1716    return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1717}
1718
1719
1720//
1721//   uregex_appendTailUText...can just use the normal C++ method
1722//
1723U_CAPI UText * U_EXPORT2
1724uregex_appendTailUText(URegularExpression    *regexp2,
1725                       UText                 *dest,
1726                       UErrorCode            *status)  {
1727    RegularExpression *regexp = (RegularExpression*)regexp2;
1728    return regexp->fMatcher->appendTail(dest, *status);
1729}
1730
1731
//------------------------------------------------------------------------------
//
//    copyString     Internal utility (currently compiled out) that appends a
//                   string to an output buffer.
//
//                   Characters are stored only while the running index is
//                   inside the buffer; characters that do not fit are still
//                   added to the index, so the caller ends up with the size
//                   that would have been needed.  A terminating NUL is
//                   written if there is room for it and is always counted
//                   in the updated index.
//
//------------------------------------------------------------------------------
#if 0
static void copyString(UChar        *destBuffer,    //  Destination buffer.
                       int32_t       destCapacity,  //  Total capacity of dest buffer
                       int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
                                                    //    Update not clipped to destCapacity.
                       const UChar  *srcPtr,        //  Pointer to source string
                       int32_t       srcLen)        //  Source string len.
{
    int32_t  outPos = *destIndex;
    int32_t  inPos  = 0;

    // Copy for as long as there is both source left and room in the buffer.
    while (inPos < srcLen && outPos < destCapacity) {
        destBuffer[outPos] = srcPtr[inPos];
        ++outPos;
        ++inPos;
    }

    // Anything that did not fit is counted but not stored.
    if (inPos < srcLen) {
        outPos += srcLen - inPos;
    }

    // Terminate when possible; the NUL is counted either way.
    if (outPos < destCapacity) {
        destBuffer[outPos] = 0;
    }
    *destIndex = outPos + 1;
}
#endif
1769
1770//------------------------------------------------------------------------------
1771//
1772//    uregex_split
1773//
1774//------------------------------------------------------------------------------
1775int32_t RegexCImpl::split(RegularExpression     *regexp,
1776                          UChar                 *destBuf,
1777                          int32_t                destCapacity,
1778                          int32_t               *requiredCapacity,
1779                          UChar                 *destFields[],
1780                          int32_t                destFieldsCapacity,
1781                          UErrorCode            *status) {
1782    //
1783    // Reset for the input text
1784    //
1785    regexp->fMatcher->reset();
1786    UText *inputText = regexp->fMatcher->fInputText;
1787    int64_t   nextOutputStringStart = 0;
1788    int64_t   inputLen = regexp->fMatcher->fInputLength;
1789    if (inputLen == 0) {
1790        return 0;
1791    }
1792
1793    //
1794    // Loop through the input text, searching for the delimiter pattern
1795    //
1796    int32_t   i;             // Index of the field being processed.
1797    int32_t   destIdx = 0;   // Next available position in destBuf;
1798    int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
1799    UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
1800    for (i=0; ; i++) {
1801        if (i>=destFieldsCapacity-1) {
1802            // There are one or zero output strings left.
1803            // Fill the last output string with whatever is left from the input, then exit the loop.
1804            //  ( i will be == destFieldsCapacity if we filled the output array while processing
1805            //    capture groups of the delimiter expression, in which case we will discard the
1806            //    last capture group saved in favor of the unprocessed remainder of the
1807            //    input string.)
1808            if (inputLen > nextOutputStringStart) {
1809                if (i != destFieldsCapacity-1) {
1810                    // No fields are left.  Recycle the last one for holding the trailing part of
1811                    //   the input string.
1812                    i = destFieldsCapacity-1;
1813                    destIdx = (int32_t)(destFields[i] - destFields[0]);
1814                }
1815
1816                destFields[i] = &destBuf[destIdx];
1817                destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1818                                             &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1819            }
1820            break;
1821        }
1822
1823        if (regexp->fMatcher->find()) {
1824            // We found another delimiter.  Move everything from where we started looking
1825            //  up until the start of the delimiter into the next output string.
1826            destFields[i] = &destBuf[destIdx];
1827
1828            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1829                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1830            if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1831                tStatus = U_ZERO_ERROR;
1832            } else {
1833                *status = tStatus;
1834            }
1835            nextOutputStringStart = regexp->fMatcher->fMatchEnd;
1836
1837            // If the delimiter pattern has capturing parentheses, the captured
1838            //  text goes out into the next n destination strings.
1839            int32_t groupNum;
1840            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1841                // If we've run out of output string slots, bail out.
1842                if (i==destFieldsCapacity-1) {
1843                    break;
1844                }
1845                i++;
1846
1847                // Set up to extract the capture group contents into the dest buffer.
1848                destFields[i] = &destBuf[destIdx];
1849                tStatus = U_ZERO_ERROR;
1850                int32_t t = uregex_group((URegularExpression*)regexp,
1851                                         groupNum,
1852                                         destFields[i],
1853                                         REMAINING_CAPACITY(destIdx, destCapacity),
1854                                         &tStatus);
1855                destIdx += t + 1;    // Record the space used in the output string buffer.
1856                                     //  +1 for the NUL that terminates the string.
1857                if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1858                    tStatus = U_ZERO_ERROR;
1859                } else {
1860                    *status = tStatus;
1861                }
1862            }
1863
1864            if (nextOutputStringStart == inputLen) {
1865                // The delimiter was at the end of the string.
1866                // Output an empty string, and then we are done.
1867                if (destIdx < destCapacity) {
1868                    destBuf[destIdx] = 0;
1869                }
1870                if (i < destFieldsCapacity-1) {
1871                   ++i;
1872                }
1873                if (destIdx < destCapacity) {
1874                    destFields[i] = destBuf + destIdx;
1875                }
1876                ++destIdx;
1877                break;
1878            }
1879
1880        }
1881        else
1882        {
1883            // We ran off the end of the input while looking for the next delimiter.
1884            // All the remaining text goes into the current output string.
1885            destFields[i] = &destBuf[destIdx];
1886            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1887                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1888            break;
1889        }
1890    }
1891
1892    // Zero out any unused portion of the destFields array
1893    int j;
1894    for (j=i+1; j<destFieldsCapacity; j++) {
1895        destFields[j] = NULL;
1896    }
1897
1898    if (requiredCapacity != NULL) {
1899        *requiredCapacity = destIdx;
1900    }
1901    if (destIdx > destCapacity) {
1902        *status = U_BUFFER_OVERFLOW_ERROR;
1903    }
1904    return i+1;
1905}
1906
1907//
1908//   uregex_split   The actual API function
1909//
1910U_CAPI int32_t U_EXPORT2
1911uregex_split(URegularExpression      *regexp2,
1912             UChar                   *destBuf,
1913             int32_t                  destCapacity,
1914             int32_t                 *requiredCapacity,
1915             UChar                   *destFields[],
1916             int32_t                  destFieldsCapacity,
1917             UErrorCode              *status) {
1918    RegularExpression *regexp = (RegularExpression*)regexp2;
1919    if (validateRE(regexp, TRUE, status) == FALSE) {
1920        return 0;
1921    }
1922    if ((destBuf == NULL && destCapacity > 0) ||
1923        destCapacity < 0 ||
1924        destFields == NULL ||
1925        destFieldsCapacity < 1 ) {
1926        *status = U_ILLEGAL_ARGUMENT_ERROR;
1927        return 0;
1928    }
1929
1930    return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1931}
1932
1933
1934//
1935//   uregex_splitUText...can just use the normal C++ method
1936//
1937U_CAPI int32_t U_EXPORT2
1938uregex_splitUText(URegularExpression    *regexp2,
1939                  UText                 *destFields[],
1940                  int32_t                destFieldsCapacity,
1941                  UErrorCode            *status) {
1942    RegularExpression *regexp = (RegularExpression*)regexp2;
1943    return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1944}
1945
1946
1947#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1948
1949