1/*
2*******************************************************************************
3*   Copyright (C) 2004-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  regex.cpp
7*/
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13#include "unicode/regex.h"
14#include "unicode/uregex.h"
15#include "unicode/unistr.h"
16#include "unicode/ustring.h"
17#include "unicode/uchar.h"
18#include "unicode/uobject.h"
19#include "umutex.h"
20#include "uassert.h"
21#include "cmemory.h"
22
23#include "regextxt.h"
24
25#include <stdio.h>
26
27U_NAMESPACE_BEGIN
28
// Evaluates to (len - idx) when that difference is positive, and to 0 otherwise.
// Both arguments are expanded more than once, so pass side-effect-free expressions.
#define REMAINING_CAPACITY(idx,len) \
    ((((len)-(idx))>0) ? ((len)-(idx)) : 0)
30
31struct RegularExpression: public UMemory {
32public:
33    RegularExpression();
34    ~RegularExpression();
35    int32_t           fMagic;
36    RegexPattern     *fPat;
37    int32_t          *fPatRefCount;
38    UChar            *fPatString;
39    int32_t           fPatStringLen;
40    RegexMatcher     *fMatcher;
41    const UChar      *fText;         // Text from setText()
42    int32_t           fTextLength;   // Length provided by user with setText(), which
43                                     //  may be -1.
44    UBool             fOwnsText;
45};
46
47static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
48
49RegularExpression::RegularExpression() {
50    fMagic        = REXP_MAGIC;
51    fPat          = NULL;
52    fPatRefCount  = NULL;
53    fPatString    = NULL;
54    fPatStringLen = 0;
55    fMatcher      = NULL;
56    fText         = NULL;
57    fTextLength   = 0;
58    fOwnsText     = FALSE;
59}
60
61RegularExpression::~RegularExpression() {
62    delete fMatcher;
63    fMatcher = NULL;
64    if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
65        delete fPat;
66        uprv_free(fPatString);
67        uprv_free(fPatRefCount);
68    }
69    if (fOwnsText && fText!=NULL) {
70        uprv_free((void *)fText);
71    }
72    fMagic = 0;
73}
74
75U_NAMESPACE_END
76
77U_NAMESPACE_USE
78
79//----------------------------------------------------------------------------------------
80//
81//   validateRE    Do boilerplate style checks on API function parameters.
82//                 Return TRUE if they look OK.
83//----------------------------------------------------------------------------------------
84static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
85    if (U_FAILURE(*status)) {
86        return FALSE;
87    }
88    if (re == NULL || re->fMagic != REXP_MAGIC) {
89        *status = U_ILLEGAL_ARGUMENT_ERROR;
90        return FALSE;
91    }
92    // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
93    if (requiresText && re->fText == NULL && !re->fOwnsText) {
94        *status = U_REGEX_INVALID_STATE;
95        return FALSE;
96    }
97    return TRUE;
98}
99
100//----------------------------------------------------------------------------------------
101//
102//    uregex_open
103//
104//----------------------------------------------------------------------------------------
105U_CAPI URegularExpression *  U_EXPORT2
106uregex_open( const  UChar          *pattern,
107                    int32_t         patternLength,
108                    uint32_t        flags,
109                    UParseError    *pe,
110                    UErrorCode     *status) {
111
112    if (U_FAILURE(*status)) {
113        return NULL;
114    }
115    if (pattern == NULL || patternLength < -1 || patternLength == 0) {
116        *status = U_ILLEGAL_ARGUMENT_ERROR;
117        return NULL;
118    }
119    int32_t actualPatLen = patternLength;
120    if (actualPatLen == -1) {
121        actualPatLen = u_strlen(pattern);
122    }
123
124    RegularExpression *re     = new RegularExpression;
125    int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
126    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
127    if (re == NULL || refC == NULL || patBuf == NULL) {
128        *status = U_MEMORY_ALLOCATION_ERROR;
129        delete re;
130        uprv_free(refC);
131        uprv_free(patBuf);
132        return NULL;
133    }
134    re->fPatRefCount = refC;
135    *re->fPatRefCount = 1;
136
137    //
138    // Make a copy of the pattern string, so we can return it later if asked.
139    //    For compiling the pattern, we will use a UText wrapper around
140    //    this local copy, to avoid making even more copies.
141    //
142    re->fPatString    = patBuf;
143    re->fPatStringLen = patternLength;
144    u_memcpy(patBuf, pattern, actualPatLen);
145    patBuf[actualPatLen] = 0;
146
147    UText patText = UTEXT_INITIALIZER;
148    utext_openUChars(&patText, patBuf, patternLength, status);
149
150    //
151    // Compile the pattern
152    //
153    if (pe != NULL) {
154        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
155    } else {
156        re->fPat = RegexPattern::compile(&patText, flags, *status);
157    }
158    utext_close(&patText);
159
160    if (U_FAILURE(*status)) {
161        goto ErrorExit;
162    }
163
164    //
165    // Create the matcher object
166    //
167    re->fMatcher = re->fPat->matcher(*status);
168    if (U_SUCCESS(*status)) {
169        return (URegularExpression*)re;
170    }
171
172ErrorExit:
173    delete re;
174    return NULL;
175
176}
177
178//----------------------------------------------------------------------------------------
179//
180//    uregex_openUText
181//
182//----------------------------------------------------------------------------------------
183U_CAPI URegularExpression *  U_EXPORT2
184uregex_openUText(UText          *pattern,
185                 uint32_t        flags,
186                 UParseError    *pe,
187                 UErrorCode     *status) {
188
189    if (U_FAILURE(*status)) {
190        return NULL;
191    }
192    if (pattern == NULL) {
193        *status = U_ILLEGAL_ARGUMENT_ERROR;
194        return NULL;
195    }
196
197    int64_t patternNativeLength = utext_nativeLength(pattern);
198
199    if (patternNativeLength == 0) {
200        *status = U_ILLEGAL_ARGUMENT_ERROR;
201        return NULL;
202    }
203
204    RegularExpression *re     = new RegularExpression;
205
206    UErrorCode lengthStatus = U_ZERO_ERROR;
207    int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
208
209    int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
210    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
211    if (re == NULL || refC == NULL || patBuf == NULL) {
212        *status = U_MEMORY_ALLOCATION_ERROR;
213        delete re;
214        uprv_free(refC);
215        uprv_free(patBuf);
216        return NULL;
217    }
218    re->fPatRefCount = refC;
219    *re->fPatRefCount = 1;
220
221    //
222    // Make a copy of the pattern string, so we can return it later if asked.
223    //    For compiling the pattern, we will use a read-only UText wrapper
224    //    around this local copy, to avoid making even more copies.
225    //
226    re->fPatString    = patBuf;
227    re->fPatStringLen = pattern16Length;
228    utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
229
230    UText patText = UTEXT_INITIALIZER;
231    utext_openUChars(&patText, patBuf, pattern16Length, status);
232
233    //
234    // Compile the pattern
235    //
236    if (pe != NULL) {
237        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
238    } else {
239        re->fPat = RegexPattern::compile(&patText, flags, *status);
240    }
241    utext_close(&patText);
242
243    if (U_FAILURE(*status)) {
244        goto ErrorExit;
245    }
246
247    //
248    // Create the matcher object
249    //
250    re->fMatcher = re->fPat->matcher(*status);
251    if (U_SUCCESS(*status)) {
252        return (URegularExpression*)re;
253    }
254
255ErrorExit:
256    delete re;
257    return NULL;
258
259}
260
261//----------------------------------------------------------------------------------------
262//
263//    uregex_close
264//
265//----------------------------------------------------------------------------------------
266U_CAPI void  U_EXPORT2
267uregex_close(URegularExpression  *re2) {
268    RegularExpression *re = (RegularExpression*)re2;
269    UErrorCode  status = U_ZERO_ERROR;
270    if (validateRE(re, FALSE, &status) == FALSE) {
271        return;
272    }
273    delete re;
274}
275
276
277//----------------------------------------------------------------------------------------
278//
279//    uregex_clone
280//
281//----------------------------------------------------------------------------------------
282U_CAPI URegularExpression * U_EXPORT2
283uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
284    RegularExpression *source = (RegularExpression*)source2;
285    if (validateRE(source, FALSE, status) == FALSE) {
286        return NULL;
287    }
288
289    RegularExpression *clone = new RegularExpression;
290    if (clone == NULL) {
291        *status = U_MEMORY_ALLOCATION_ERROR;
292        return NULL;
293    }
294
295    clone->fMatcher = source->fPat->matcher(*status);
296    if (U_FAILURE(*status)) {
297        delete clone;
298        return NULL;
299    }
300
301    clone->fPat          = source->fPat;
302    clone->fPatRefCount  = source->fPatRefCount;
303    clone->fPatString    = source->fPatString;
304    clone->fPatStringLen = source->fPatStringLen;
305    umtx_atomic_inc(source->fPatRefCount);
306    // Note:  fText is not cloned.
307
308    return (URegularExpression*)clone;
309}
310
311
312
313
314//------------------------------------------------------------------------------
315//
316//    uregex_pattern
317//
318//------------------------------------------------------------------------------
319U_CAPI const UChar * U_EXPORT2
320uregex_pattern(const  URegularExpression *regexp2,
321                      int32_t            *patLength,
322                      UErrorCode         *status)  {
323    RegularExpression *regexp = (RegularExpression*)regexp2;
324
325    if (validateRE(regexp, FALSE, status) == FALSE) {
326        return NULL;
327    }
328    if (patLength != NULL) {
329        *patLength = regexp->fPatStringLen;
330    }
331    return regexp->fPatString;
332}
333
334
335//------------------------------------------------------------------------------
336//
337//    uregex_patternUText
338//
339//------------------------------------------------------------------------------
340U_CAPI UText * U_EXPORT2
341uregex_patternUText(const URegularExpression *regexp2,
342                          UErrorCode         *status)  {
343    RegularExpression *regexp = (RegularExpression*)regexp2;
344    return regexp->fPat->patternText(*status);
345}
346
347
348//------------------------------------------------------------------------------
349//
350//    uregex_flags
351//
352//------------------------------------------------------------------------------
353U_CAPI int32_t U_EXPORT2
354uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
355    RegularExpression *regexp = (RegularExpression*)regexp2;
356    if (validateRE(regexp, FALSE, status) == FALSE) {
357        return 0;
358    }
359    int32_t flags = regexp->fPat->flags();
360    return flags;
361}
362
363
364//------------------------------------------------------------------------------
365//
366//    uregex_setText
367//
368//------------------------------------------------------------------------------
369U_CAPI void U_EXPORT2
370uregex_setText(URegularExpression *regexp2,
371               const UChar        *text,
372               int32_t             textLength,
373               UErrorCode         *status)  {
374    RegularExpression *regexp = (RegularExpression*)regexp2;
375    if (validateRE(regexp, FALSE, status) == FALSE) {
376        return;
377    }
378    if (text == NULL || textLength < -1) {
379        *status = U_ILLEGAL_ARGUMENT_ERROR;
380        return;
381    }
382
383    if (regexp->fOwnsText && regexp->fText != NULL) {
384        uprv_free((void *)regexp->fText);
385    }
386
387    regexp->fText       = text;
388    regexp->fTextLength = textLength;
389    regexp->fOwnsText   = FALSE;
390
391    UText input = UTEXT_INITIALIZER;
392    utext_openUChars(&input, text, textLength, status);
393    regexp->fMatcher->reset(&input);
394    utext_close(&input); // reset() made a shallow clone, so we don't need this copy
395}
396
397
398//------------------------------------------------------------------------------
399//
400//    uregex_setUText
401//
402//------------------------------------------------------------------------------
403U_CAPI void U_EXPORT2
404uregex_setUText(URegularExpression *regexp2,
405                UText              *text,
406                UErrorCode         *status) {
407    RegularExpression *regexp = (RegularExpression*)regexp2;
408    if (validateRE(regexp, FALSE, status) == FALSE) {
409        return;
410    }
411    if (text == NULL) {
412        *status = U_ILLEGAL_ARGUMENT_ERROR;
413        return;
414    }
415
416    if (regexp->fOwnsText && regexp->fText != NULL) {
417        uprv_free((void *)regexp->fText);
418    }
419
420    regexp->fText       = NULL; // only fill it in on request
421    regexp->fTextLength = -1;
422    regexp->fOwnsText   = TRUE;
423    regexp->fMatcher->reset(text);
424}
425
426
427
428//------------------------------------------------------------------------------
429//
430//    uregex_getText
431//
432//------------------------------------------------------------------------------
433U_CAPI const UChar * U_EXPORT2
434uregex_getText(URegularExpression *regexp2,
435               int32_t            *textLength,
436               UErrorCode         *status)  {
437    RegularExpression *regexp = (RegularExpression*)regexp2;
438    if (validateRE(regexp, FALSE, status) == FALSE) {
439        return NULL;
440    }
441
442    if (regexp->fText == NULL) {
443        // need to fill in the text
444        UText *inputText = regexp->fMatcher->inputText();
445        int64_t inputNativeLength = utext_nativeLength(inputText);
446        if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
447            regexp->fText = inputText->chunkContents;
448            regexp->fTextLength = (int32_t)inputNativeLength;
449            regexp->fOwnsText = FALSE; // because the UText owns it
450        } else {
451            UErrorCode lengthStatus = U_ZERO_ERROR;
452            regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
453            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
454
455            utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
456            regexp->fText = inputChars;
457            regexp->fOwnsText = TRUE; // should already be set but just in case
458        }
459    }
460
461    if (textLength != NULL) {
462        *textLength = regexp->fTextLength;
463    }
464    return regexp->fText;
465}
466
467
468//------------------------------------------------------------------------------
469//
470//    uregex_getUText
471//
472//------------------------------------------------------------------------------
473U_CAPI UText * U_EXPORT2
474uregex_getUText(URegularExpression *regexp2,
475                UText              *dest,
476                UErrorCode         *status)  {
477    RegularExpression *regexp = (RegularExpression*)regexp2;
478    if (validateRE(regexp, FALSE, status) == FALSE) {
479        return dest;
480    }
481    return regexp->fMatcher->getInput(dest, *status);
482}
483
484
485//------------------------------------------------------------------------------
486//
487//    uregex_refreshUText
488//
489//------------------------------------------------------------------------------
490U_CAPI void U_EXPORT2
491uregex_refreshUText(URegularExpression *regexp2,
492                    UText              *text,
493                    UErrorCode         *status) {
494    RegularExpression *regexp = (RegularExpression*)regexp2;
495    if (validateRE(regexp, FALSE, status) == FALSE) {
496        return;
497    }
498    regexp->fMatcher->refreshInputText(text, *status);
499}
500
501
502//------------------------------------------------------------------------------
503//
504//    uregex_matches
505//
506//------------------------------------------------------------------------------
507U_CAPI UBool U_EXPORT2
508uregex_matches(URegularExpression *regexp2,
509               int32_t            startIndex,
510               UErrorCode        *status)  {
511    return uregex_matches64( regexp2, (int64_t)startIndex, status);
512}
513
514U_CAPI UBool U_EXPORT2
515uregex_matches64(URegularExpression *regexp2,
516                 int64_t            startIndex,
517                 UErrorCode        *status)  {
518    RegularExpression *regexp = (RegularExpression*)regexp2;
519    UBool result = FALSE;
520    if (validateRE(regexp, TRUE, status) == FALSE) {
521        return result;
522    }
523    if (startIndex == -1) {
524        result = regexp->fMatcher->matches(*status);
525    } else {
526        result = regexp->fMatcher->matches(startIndex, *status);
527    }
528    return result;
529}
530
531
532//------------------------------------------------------------------------------
533//
534//    uregex_lookingAt
535//
536//------------------------------------------------------------------------------
537U_CAPI UBool U_EXPORT2
538uregex_lookingAt(URegularExpression *regexp2,
539                 int32_t             startIndex,
540                 UErrorCode         *status)  {
541    return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
542}
543
544U_CAPI UBool U_EXPORT2
545uregex_lookingAt64(URegularExpression *regexp2,
546                   int64_t             startIndex,
547                   UErrorCode         *status)  {
548    RegularExpression *regexp = (RegularExpression*)regexp2;
549    UBool result = FALSE;
550    if (validateRE(regexp, TRUE, status) == FALSE) {
551        return result;
552    }
553    if (startIndex == -1) {
554        result = regexp->fMatcher->lookingAt(*status);
555    } else {
556        result = regexp->fMatcher->lookingAt(startIndex, *status);
557    }
558    return result;
559}
560
561
562
563//------------------------------------------------------------------------------
564//
565//    uregex_find
566//
567//------------------------------------------------------------------------------
568U_CAPI UBool U_EXPORT2
569uregex_find(URegularExpression *regexp2,
570            int32_t             startIndex,
571            UErrorCode         *status)  {
572    return uregex_find64( regexp2, (int64_t)startIndex, status);
573}
574
575U_CAPI UBool U_EXPORT2
576uregex_find64(URegularExpression *regexp2,
577              int64_t             startIndex,
578              UErrorCode         *status)  {
579    RegularExpression *regexp = (RegularExpression*)regexp2;
580    UBool result = FALSE;
581    if (validateRE(regexp, TRUE, status) == FALSE) {
582        return result;
583    }
584    if (startIndex == -1) {
585        regexp->fMatcher->resetPreserveRegion();
586        result = regexp->fMatcher->find();
587    } else {
588        result = regexp->fMatcher->find(startIndex, *status);
589    }
590    return result;
591}
592
593
594//------------------------------------------------------------------------------
595//
596//    uregex_findNext
597//
598//------------------------------------------------------------------------------
599U_CAPI UBool U_EXPORT2
600uregex_findNext(URegularExpression *regexp2,
601                UErrorCode         *status)  {
602    RegularExpression *regexp = (RegularExpression*)regexp2;
603    if (validateRE(regexp, TRUE, status) == FALSE) {
604        return FALSE;
605    }
606    UBool result = regexp->fMatcher->find();
607    return result;
608}
609
610//------------------------------------------------------------------------------
611//
612//    uregex_groupCount
613//
614//------------------------------------------------------------------------------
615U_CAPI int32_t U_EXPORT2
616uregex_groupCount(URegularExpression *regexp2,
617                  UErrorCode         *status)  {
618    RegularExpression *regexp = (RegularExpression*)regexp2;
619    if (validateRE(regexp, FALSE, status) == FALSE) {
620        return 0;
621    }
622    int32_t  result = regexp->fMatcher->groupCount();
623    return result;
624}
625
626
627//------------------------------------------------------------------------------
628//
629//    uregex_group
630//
631//------------------------------------------------------------------------------
632U_CAPI int32_t U_EXPORT2
633uregex_group(URegularExpression *regexp2,
634             int32_t             groupNum,
635             UChar              *dest,
636             int32_t             destCapacity,
637             UErrorCode          *status)  {
638    RegularExpression *regexp = (RegularExpression*)regexp2;
639    if (validateRE(regexp, TRUE, status) == FALSE) {
640        return 0;
641    }
642    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
643        *status = U_ILLEGAL_ARGUMENT_ERROR;
644        return 0;
645    }
646
647    if (destCapacity == 0 || regexp->fText != NULL) {
648        // If preflighting or if we already have the text as UChars,
649        // this is a little cheaper than going through uregex_groupUTextDeep()
650
651        //
652        // Pick up the range of characters from the matcher
653        //
654        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
655        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
656        if (U_FAILURE(*status)) {
657            return 0;
658        }
659
660        //
661        // Trim length based on buffer capacity
662        //
663        int32_t fullLength = endIx - startIx;
664        int32_t copyLength = fullLength;
665        if (copyLength < destCapacity) {
666            dest[copyLength] = 0;
667        } else if (copyLength == destCapacity) {
668            *status = U_STRING_NOT_TERMINATED_WARNING;
669        } else {
670            copyLength = destCapacity;
671            *status = U_BUFFER_OVERFLOW_ERROR;
672        }
673
674        //
675        // Copy capture group to user's buffer
676        //
677        if (copyLength > 0) {
678            u_memcpy(dest, &regexp->fText[startIx], copyLength);
679        }
680        return fullLength;
681    } else {
682        UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
683        int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
684        utext_close(groupText);
685        return result;
686    }
687}
688
689
690//------------------------------------------------------------------------------
691//
692//    uregex_groupUText
693//
694//------------------------------------------------------------------------------
695U_CAPI UText * U_EXPORT2
696uregex_groupUText(URegularExpression *regexp2,
697                  int32_t             groupNum,
698                  UText              *dest,
699                  int64_t            *groupLength,
700                  UErrorCode         *status)  {
701    RegularExpression *regexp = (RegularExpression*)regexp2;
702    if (validateRE(regexp, TRUE, status) == FALSE) {
703        UErrorCode emptyTextStatus = U_ZERO_ERROR;
704        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
705    }
706
707    return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
708}
709
710//------------------------------------------------------------------------------
711//
712//    uregex_groupUTextDeep
713//
714//------------------------------------------------------------------------------
715U_CAPI UText * U_EXPORT2
716uregex_groupUTextDeep(URegularExpression *regexp2,
717                  int32_t             groupNum,
718                  UText              *dest,
719                  UErrorCode         *status)  {
720    RegularExpression *regexp = (RegularExpression*)regexp2;
721    if (validateRE(regexp, TRUE, status) == FALSE) {
722        UErrorCode emptyTextStatus = U_ZERO_ERROR;
723        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
724    }
725
726    if (regexp->fText != NULL) {
727        //
728        // Pick up the range of characters from the matcher
729        // and use our already-extracted characters
730        //
731        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
732        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
733        if (U_FAILURE(*status)) {
734            UErrorCode emptyTextStatus = U_ZERO_ERROR;
735            return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
736        }
737
738        if (dest) {
739            utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
740        } else {
741            UText groupText = UTEXT_INITIALIZER;
742            utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
743            dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
744            utext_close(&groupText);
745        }
746
747        return dest;
748    } else {
749        return regexp->fMatcher->group(groupNum, dest, *status);
750    }
751}
752
753//------------------------------------------------------------------------------
754//
755//    uregex_start
756//
757//------------------------------------------------------------------------------
758U_CAPI int32_t U_EXPORT2
759uregex_start(URegularExpression *regexp2,
760             int32_t             groupNum,
761             UErrorCode          *status)  {
762    return (int32_t)uregex_start64( regexp2, groupNum, status);
763}
764
765U_CAPI int64_t U_EXPORT2
766uregex_start64(URegularExpression *regexp2,
767               int32_t             groupNum,
768               UErrorCode          *status)  {
769    RegularExpression *regexp = (RegularExpression*)regexp2;
770    if (validateRE(regexp, TRUE, status) == FALSE) {
771        return 0;
772    }
773    int32_t result = regexp->fMatcher->start(groupNum, *status);
774    return result;
775}
776
777//------------------------------------------------------------------------------
778//
779//    uregex_end
780//
781//------------------------------------------------------------------------------
782U_CAPI int32_t U_EXPORT2
783uregex_end(URegularExpression   *regexp2,
784           int32_t               groupNum,
785           UErrorCode           *status)  {
786    return (int32_t)uregex_end64( regexp2, groupNum, status);
787}
788
789U_CAPI int64_t U_EXPORT2
790uregex_end64(URegularExpression   *regexp2,
791             int32_t               groupNum,
792             UErrorCode           *status)  {
793    RegularExpression *regexp = (RegularExpression*)regexp2;
794    if (validateRE(regexp, TRUE, status) == FALSE) {
795        return 0;
796    }
797    int32_t result = regexp->fMatcher->end(groupNum, *status);
798    return result;
799}
800
801//------------------------------------------------------------------------------
802//
803//    uregex_reset
804//
805//------------------------------------------------------------------------------
806U_CAPI void U_EXPORT2
807uregex_reset(URegularExpression    *regexp2,
808             int32_t               index,
809             UErrorCode            *status)  {
810    uregex_reset64( regexp2, (int64_t)index, status);
811}
812
813U_CAPI void U_EXPORT2
814uregex_reset64(URegularExpression    *regexp2,
815               int64_t               index,
816               UErrorCode            *status)  {
817    RegularExpression *regexp = (RegularExpression*)regexp2;
818    if (validateRE(regexp, TRUE, status) == FALSE) {
819        return;
820    }
821    regexp->fMatcher->reset(index, *status);
822}
823
824
825//------------------------------------------------------------------------------
826//
827//    uregex_setRegion
828//
829//------------------------------------------------------------------------------
830U_CAPI void U_EXPORT2
831uregex_setRegion(URegularExpression   *regexp2,
832                 int32_t               regionStart,
833                 int32_t               regionLimit,
834                 UErrorCode           *status)  {
835    uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
836}
837
838U_CAPI void U_EXPORT2
839uregex_setRegion64(URegularExpression   *regexp2,
840                   int64_t               regionStart,
841                   int64_t               regionLimit,
842                   UErrorCode           *status)  {
843    RegularExpression *regexp = (RegularExpression*)regexp2;
844    if (validateRE(regexp, TRUE, status) == FALSE) {
845        return;
846    }
847    regexp->fMatcher->region(regionStart, regionLimit, *status);
848}
849
850
851//------------------------------------------------------------------------------
852//
853//    uregex_setRegionAndStart
854//
855//------------------------------------------------------------------------------
856U_DRAFT void U_EXPORT2
857uregex_setRegionAndStart(URegularExpression   *regexp2,
858                 int64_t               regionStart,
859                 int64_t               regionLimit,
860                 int64_t               startIndex,
861                 UErrorCode           *status)  {
862    RegularExpression *regexp = (RegularExpression*)regexp2;
863    if (validateRE(regexp, TRUE, status) == FALSE) {
864        return;
865    }
866    regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
867}
868
869//------------------------------------------------------------------------------
870//
871//    uregex_regionStart
872//
873//------------------------------------------------------------------------------
874U_CAPI int32_t U_EXPORT2
875uregex_regionStart(const  URegularExpression   *regexp2,
876                          UErrorCode           *status)  {
877    return (int32_t)uregex_regionStart64(regexp2, status);
878}
879
880U_CAPI int64_t U_EXPORT2
881uregex_regionStart64(const  URegularExpression   *regexp2,
882                            UErrorCode           *status)  {
883    RegularExpression *regexp = (RegularExpression*)regexp2;
884    if (validateRE(regexp, TRUE, status) == FALSE) {
885        return 0;
886    }
887    return regexp->fMatcher->regionStart();
888}
889
890
891//------------------------------------------------------------------------------
892//
893//    uregex_regionEnd
894//
895//------------------------------------------------------------------------------
896U_CAPI int32_t U_EXPORT2
897uregex_regionEnd(const  URegularExpression   *regexp2,
898                        UErrorCode           *status)  {
899    return (int32_t)uregex_regionEnd64(regexp2, status);
900}
901
902U_CAPI int64_t U_EXPORT2
903uregex_regionEnd64(const  URegularExpression   *regexp2,
904                          UErrorCode           *status)  {
905    RegularExpression *regexp = (RegularExpression*)regexp2;
906    if (validateRE(regexp, TRUE, status) == FALSE) {
907        return 0;
908    }
909    return regexp->fMatcher->regionEnd();
910}
911
912
913//------------------------------------------------------------------------------
914//
915//    uregex_hasTransparentBounds
916//
917//------------------------------------------------------------------------------
918U_CAPI UBool U_EXPORT2
919uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
920                                   UErrorCode           *status)  {
921    RegularExpression *regexp = (RegularExpression*)regexp2;
922    if (validateRE(regexp, FALSE, status) == FALSE) {
923        return FALSE;
924    }
925    return regexp->fMatcher->hasTransparentBounds();
926}
927
928
929//------------------------------------------------------------------------------
930//
931//    uregex_useTransparentBounds
932//
933//------------------------------------------------------------------------------
934U_CAPI void U_EXPORT2
935uregex_useTransparentBounds(URegularExpression    *regexp2,
936                            UBool                  b,
937                            UErrorCode            *status)  {
938    RegularExpression *regexp = (RegularExpression*)regexp2;
939    if (validateRE(regexp, FALSE, status) == FALSE) {
940        return;
941    }
942    regexp->fMatcher->useTransparentBounds(b);
943}
944
945
946//------------------------------------------------------------------------------
947//
948//    uregex_hasAnchoringBounds
949//
950//------------------------------------------------------------------------------
951U_CAPI UBool U_EXPORT2
952uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
953                                 UErrorCode           *status)  {
954    RegularExpression *regexp = (RegularExpression*)regexp2;
955    if (validateRE(regexp, FALSE, status) == FALSE) {
956        return FALSE;
957    }
958    return regexp->fMatcher->hasAnchoringBounds();
959}
960
961
962//------------------------------------------------------------------------------
963//
964//    uregex_useAnchoringBounds
965//
966//------------------------------------------------------------------------------
967U_CAPI void U_EXPORT2
968uregex_useAnchoringBounds(URegularExpression    *regexp2,
969                          UBool                  b,
970                          UErrorCode            *status)  {
971    RegularExpression *regexp = (RegularExpression*)regexp2;
972    if (validateRE(regexp, FALSE, status) == FALSE) {
973        return;
974    }
975    regexp->fMatcher->useAnchoringBounds(b);
976}
977
978
979//------------------------------------------------------------------------------
980//
981//    uregex_hitEnd
982//
983//------------------------------------------------------------------------------
984U_CAPI UBool U_EXPORT2
985uregex_hitEnd(const  URegularExpression   *regexp2,
986                     UErrorCode           *status)  {
987    RegularExpression *regexp = (RegularExpression*)regexp2;
988    if (validateRE(regexp, TRUE, status) == FALSE) {
989        return FALSE;
990    }
991    return regexp->fMatcher->hitEnd();
992}
993
994
995//------------------------------------------------------------------------------
996//
997//    uregex_requireEnd
998//
999//------------------------------------------------------------------------------
1000U_CAPI UBool U_EXPORT2
1001uregex_requireEnd(const  URegularExpression   *regexp2,
1002                         UErrorCode           *status)  {
1003    RegularExpression *regexp = (RegularExpression*)regexp2;
1004    if (validateRE(regexp, TRUE, status) == FALSE) {
1005        return FALSE;
1006    }
1007    return regexp->fMatcher->requireEnd();
1008}
1009
1010
1011//------------------------------------------------------------------------------
1012//
1013//    uregex_setTimeLimit
1014//
1015//------------------------------------------------------------------------------
1016U_CAPI void U_EXPORT2
1017uregex_setTimeLimit(URegularExpression   *regexp2,
1018                    int32_t               limit,
1019                    UErrorCode           *status) {
1020    RegularExpression *regexp = (RegularExpression*)regexp2;
1021    if (validateRE(regexp, FALSE, status)) {
1022        regexp->fMatcher->setTimeLimit(limit, *status);
1023    }
1024}
1025
1026
1027
1028//------------------------------------------------------------------------------
1029//
1030//    uregex_getTimeLimit
1031//
1032//------------------------------------------------------------------------------
1033U_CAPI int32_t U_EXPORT2
1034uregex_getTimeLimit(const  URegularExpression   *regexp2,
1035                           UErrorCode           *status) {
1036    int32_t retVal = 0;
1037    RegularExpression *regexp = (RegularExpression*)regexp2;
1038    if (validateRE(regexp, FALSE, status)) {
1039        retVal = regexp->fMatcher->getTimeLimit();
1040    }
1041    return retVal;
1042}
1043
1044
1045
1046//------------------------------------------------------------------------------
1047//
1048//    uregex_setStackLimit
1049//
1050//------------------------------------------------------------------------------
1051U_CAPI void U_EXPORT2
1052uregex_setStackLimit(URegularExpression   *regexp2,
1053                     int32_t               limit,
1054                     UErrorCode           *status) {
1055    RegularExpression *regexp = (RegularExpression*)regexp2;
1056    if (validateRE(regexp, FALSE, status)) {
1057        regexp->fMatcher->setStackLimit(limit, *status);
1058    }
1059}
1060
1061
1062
1063//------------------------------------------------------------------------------
1064//
1065//    uregex_getStackLimit
1066//
1067//------------------------------------------------------------------------------
1068U_CAPI int32_t U_EXPORT2
1069uregex_getStackLimit(const  URegularExpression   *regexp2,
1070                            UErrorCode           *status) {
1071    int32_t retVal = 0;
1072    RegularExpression *regexp = (RegularExpression*)regexp2;
1073    if (validateRE(regexp, FALSE, status)) {
1074        retVal = regexp->fMatcher->getStackLimit();
1075    }
1076    return retVal;
1077}
1078
1079
1080//------------------------------------------------------------------------------
1081//
1082//    uregex_setMatchCallback
1083//
1084//------------------------------------------------------------------------------
1085U_CAPI void U_EXPORT2
1086uregex_setMatchCallback(URegularExpression      *regexp2,
1087                        URegexMatchCallback     *callback,
1088                        const void              *context,
1089                        UErrorCode              *status) {
1090    RegularExpression *regexp = (RegularExpression*)regexp2;
1091    if (validateRE(regexp, FALSE, status)) {
1092        regexp->fMatcher->setMatchCallback(callback, context, *status);
1093    }
1094}
1095
1096
1097//------------------------------------------------------------------------------
1098//
1099//    uregex_getMatchCallback
1100//
1101//------------------------------------------------------------------------------
1102U_CAPI void U_EXPORT2
1103uregex_getMatchCallback(const URegularExpression    *regexp2,
1104                        URegexMatchCallback        **callback,
1105                        const void                 **context,
1106                        UErrorCode                  *status) {
1107    RegularExpression *regexp = (RegularExpression*)regexp2;
1108     if (validateRE(regexp, FALSE, status)) {
1109         regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1110     }
1111}
1112
1113
1114//------------------------------------------------------------------------------
1115//
//    uregex_setFindProgressCallback
1117//
1118//------------------------------------------------------------------------------
1119U_CAPI void U_EXPORT2
1120uregex_setFindProgressCallback(URegularExpression              *regexp2,
1121                                URegexFindProgressCallback      *callback,
1122                                const void                      *context,
1123                                UErrorCode                      *status) {
1124    RegularExpression *regexp = (RegularExpression*)regexp2;
1125    if (validateRE(regexp, FALSE, status)) {
1126        regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1127    }
1128}
1129
1130
1131//------------------------------------------------------------------------------
1132//
//    uregex_getFindProgressCallback
1134//
1135//------------------------------------------------------------------------------
1136U_CAPI void U_EXPORT2
1137uregex_getFindProgressCallback(const URegularExpression          *regexp2,
1138                                URegexFindProgressCallback        **callback,
1139                                const void                        **context,
1140                                UErrorCode                        *status) {
1141    RegularExpression *regexp = (RegularExpression*)regexp2;
1142     if (validateRE(regexp, FALSE, status)) {
1143         regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1144     }
1145}
1146
1147
1148//------------------------------------------------------------------------------
1149//
1150//    uregex_replaceAll
1151//
1152//------------------------------------------------------------------------------
1153U_CAPI int32_t U_EXPORT2
1154uregex_replaceAll(URegularExpression    *regexp2,
1155                  const UChar           *replacementText,
1156                  int32_t                replacementLength,
1157                  UChar                 *destBuf,
1158                  int32_t                destCapacity,
1159                  UErrorCode            *status)  {
1160    RegularExpression *regexp = (RegularExpression*)regexp2;
1161    if (validateRE(regexp, TRUE, status) == FALSE) {
1162        return 0;
1163    }
1164    if (replacementText == NULL || replacementLength < -1 ||
1165        (destBuf == NULL && destCapacity > 0) ||
1166        destCapacity < 0) {
1167        *status = U_ILLEGAL_ARGUMENT_ERROR;
1168        return 0;
1169    }
1170
1171    int32_t   len = 0;
1172
1173    uregex_reset(regexp2, 0, status);
1174
1175    // Note: Seperate error code variables for findNext() and appendReplacement()
1176    //       are used so that destination buffer overflow errors
1177    //       in appendReplacement won't stop findNext() from working.
1178    //       appendReplacement() and appendTail() special case incoming buffer
1179    //       overflow errors, continuing to return the correct length.
1180    UErrorCode  findStatus = *status;
1181    while (uregex_findNext(regexp2, &findStatus)) {
1182        len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1183                                        &destBuf, &destCapacity, status);
1184    }
1185    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1186
1187    if (U_FAILURE(findStatus)) {
1188        // If anything went wrong with the findNext(), make that error trump
1189        //   whatever may have happened with the append() operations.
1190        //   Errors in findNext() are not expected.
1191        *status = findStatus;
1192    }
1193
1194    return len;
1195}
1196
1197
1198//------------------------------------------------------------------------------
1199//
1200//    uregex_replaceAllUText
1201//
1202//------------------------------------------------------------------------------
1203U_CAPI UText * U_EXPORT2
1204uregex_replaceAllUText(URegularExpression    *regexp2,
1205                       UText                 *replacementText,
1206                       UText                 *dest,
1207                       UErrorCode            *status)  {
1208    RegularExpression *regexp = (RegularExpression*)regexp2;
1209    if (validateRE(regexp, TRUE, status) == FALSE) {
1210        return 0;
1211    }
1212    if (replacementText == NULL) {
1213        *status = U_ILLEGAL_ARGUMENT_ERROR;
1214        return 0;
1215    }
1216
1217    dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1218    return dest;
1219}
1220
1221
1222//------------------------------------------------------------------------------
1223//
1224//    uregex_replaceFirst
1225//
1226//------------------------------------------------------------------------------
1227U_CAPI int32_t U_EXPORT2
1228uregex_replaceFirst(URegularExpression  *regexp2,
1229                    const UChar         *replacementText,
1230                    int32_t              replacementLength,
1231                    UChar               *destBuf,
1232                    int32_t              destCapacity,
1233                    UErrorCode          *status)  {
1234    RegularExpression *regexp = (RegularExpression*)regexp2;
1235    if (validateRE(regexp, TRUE, status) == FALSE) {
1236        return 0;
1237    }
1238    if (replacementText == NULL || replacementLength < -1 ||
1239        (destBuf == NULL && destCapacity > 0) ||
1240        destCapacity < 0) {
1241        *status = U_ILLEGAL_ARGUMENT_ERROR;
1242        return 0;
1243    }
1244
1245    int32_t   len = 0;
1246    UBool     findSucceeded;
1247    uregex_reset(regexp2, 0, status);
1248    findSucceeded = uregex_find(regexp2, 0, status);
1249    if (findSucceeded) {
1250        len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
1251                                       &destBuf, &destCapacity, status);
1252    }
1253    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1254
1255    return len;
1256}
1257
1258
1259//------------------------------------------------------------------------------
1260//
1261//    uregex_replaceFirstUText
1262//
1263//------------------------------------------------------------------------------
1264U_CAPI UText * U_EXPORT2
1265uregex_replaceFirstUText(URegularExpression  *regexp2,
1266                         UText                 *replacementText,
1267                         UText                 *dest,
1268                         UErrorCode            *status)  {
1269    RegularExpression *regexp = (RegularExpression*)regexp2;
1270    if (validateRE(regexp, TRUE, status) == FALSE) {
1271        return 0;
1272    }
1273    if (replacementText == NULL) {
1274        *status = U_ILLEGAL_ARGUMENT_ERROR;
1275        return 0;
1276    }
1277
1278    dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1279    return dest;
1280}
1281
1282
1283//------------------------------------------------------------------------------
1284//
1285//    uregex_appendReplacement
1286//
1287//------------------------------------------------------------------------------
1288
1289U_NAMESPACE_BEGIN
1290//
1291//  Dummy class, because these functions need to be friends of class RegexMatcher,
1292//               and stand-alone C functions don't work as friends
1293//
1294class RegexCImpl {
1295 public:
1296   inline static  int32_t appendReplacement(RegularExpression    *regexp,
1297                      const UChar           *replacementText,
1298                      int32_t                replacementLength,
1299                      UChar                **destBuf,
1300                      int32_t               *destCapacity,
1301                      UErrorCode            *status);
1302
1303   inline static int32_t appendTail(RegularExpression    *regexp,
1304        UChar                **destBuf,
1305        int32_t               *destCapacity,
1306        UErrorCode            *status);
1307
1308    inline static int32_t split(RegularExpression    *regexp,
1309        UChar                 *destBuf,
1310        int32_t                destCapacity,
1311        int32_t               *requiredCapacity,
1312        UChar                 *destFields[],
1313        int32_t                destFieldsCapacity,
1314        UErrorCode            *status);
1315};
1316
1317U_NAMESPACE_END
1318
1319
1320
1321static const UChar BACKSLASH  = 0x5c;
1322static const UChar DOLLARSIGN = 0x24;
1323
1324//
1325//  Move a character to an output buffer, with bounds checking on the index.
1326//      Index advances even if capacity is exceeded, for preflight size computations.
1327//      This little sequence is used a LOT.
1328//
1329static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
1330    if (*idx < bufCapacity) {
1331        buf[*idx] = c;
1332    }
1333    (*idx)++;
1334}
1335
1336
1337//
1338//  appendReplacement, the actual implementation.
1339//
1340int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
1341                                      const UChar           *replacementText,
1342                                      int32_t                replacementLength,
1343                                      UChar                **destBuf,
1344                                      int32_t               *destCapacity,
1345                                      UErrorCode            *status)  {
1346
1347    // If we come in with a buffer overflow error, don't suppress the operation.
1348    //  A series of appendReplacements, appendTail need to correctly preflight
1349    //  the buffer size when an overflow happens somewhere in the middle.
1350    UBool pendingBufferOverflow = FALSE;
1351    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1352        pendingBufferOverflow = TRUE;
1353        *status = U_ZERO_ERROR;
1354    }
1355
1356    //
1357    // Validate all paramters
1358    //
1359    if (validateRE(regexp, TRUE, status) == FALSE) {
1360        return 0;
1361    }
1362    if (replacementText == NULL || replacementLength < -1 ||
1363        destCapacity == NULL || destBuf == NULL ||
1364        (*destBuf == NULL && *destCapacity > 0) ||
1365        *destCapacity < 0) {
1366        *status = U_ILLEGAL_ARGUMENT_ERROR;
1367        return 0;
1368    }
1369
1370    RegexMatcher *m = regexp->fMatcher;
1371    if (m->fMatch == FALSE) {
1372        *status = U_REGEX_INVALID_STATE;
1373        return 0;
1374    }
1375
1376    UChar    *dest             = *destBuf;
1377    int32_t   capacity         = *destCapacity;
1378    int32_t   destIdx          =  0;
1379    int32_t   i;
1380
1381    // If it wasn't supplied by the caller,  get the length of the replacement text.
1382    //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
1383    //          the fly and avoid this step.
1384    if (replacementLength == -1) {
1385        replacementLength = u_strlen(replacementText);
1386    }
1387
1388    // Copy input string from the end of previous match to start of current match
1389    if (regexp->fText != NULL) {
1390        int32_t matchStart;
1391        int32_t lastMatchEnd;
1392        if (UTEXT_USES_U16(m->fInputText)) {
1393            lastMatchEnd = (int32_t)m->fLastMatchEnd;
1394            matchStart = (int32_t)m->fMatchStart;
1395        } else {
1396            // !!!: Would like a better way to do this!
1397            UErrorCode status = U_ZERO_ERROR;
1398            lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
1399            status = U_ZERO_ERROR;
1400            matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
1401        }
1402        for (i=lastMatchEnd; i<matchStart; i++) {
1403            appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
1404        }
1405    } else {
1406        UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1407        destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1408                                 &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError);
1409    }
1410
1411
1412    // scan the replacement text, looking for substitutions ($n) and \escapes.
1413    int32_t  replIdx = 0;
1414    while (replIdx < replacementLength) {
1415        UChar  c = replacementText[replIdx];
1416        replIdx++;
1417        if (c != DOLLARSIGN && c != BACKSLASH) {
1418            // Common case, no substitution, no escaping,
1419            //  just copy the char to the dest buf.
1420            appendToBuf(c, &destIdx, dest, capacity);
1421            continue;
1422        }
1423
1424        if (c == BACKSLASH) {
1425            // Backslash Escape.  Copy the following char out without further checks.
1426            //                    Note:  Surrogate pairs don't need any special handling
1427            //                           The second half wont be a '$' or a '\', and
1428            //                           will move to the dest normally on the next
1429            //                           loop iteration.
1430            if (replIdx >= replacementLength) {
1431                break;
1432            }
1433            c = replacementText[replIdx];
1434
1435            if (c==0x55/*U*/ || c==0x75/*u*/) {
1436                // We have a \udddd or \Udddddddd escape sequence.
1437                UChar32 escapedChar =
1438                    u_unescapeAt(uregex_ucstr_unescape_charAt,
1439                       &replIdx,                   // Index is updated by unescapeAt
1440                       replacementLength,          // Length of replacement text
1441                       (void *)replacementText);
1442
1443                if (escapedChar != (UChar32)0xFFFFFFFF) {
1444                    if (escapedChar <= 0xffff) {
1445                        appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1446                    } else {
1447                        appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1448                        appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1449                    }
1450                    continue;
1451                }
1452                // Note:  if the \u escape was invalid, just fall through and
1453                //        treat it as a plain \<anything> escape.
1454            }
1455
1456            // Plain backslash escape.  Just put out the escaped character.
1457            appendToBuf(c, &destIdx, dest, capacity);
1458
1459            replIdx++;
1460            continue;
1461        }
1462
1463
1464
1465        // We've got a $.  Pick up a capture group number if one follows.
1466        // Consume at most the number of digits necessary for the largest capture
1467        // number that is valid for this pattern.
1468
1469        int32_t numDigits = 0;
1470        int32_t groupNum  = 0;
1471        UChar32 digitC;
1472        for (;;) {
1473            if (replIdx >= replacementLength) {
1474                break;
1475            }
1476            U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
1477            if (u_isdigit(digitC) == FALSE) {
1478                break;
1479            }
1480
1481            U16_FWD_1(replacementText, replIdx, replacementLength);
1482            groupNum=groupNum*10 + u_charDigitValue(digitC);
1483            numDigits++;
1484            if (numDigits >= m->fPattern->fMaxCaptureDigits) {
1485                break;
1486            }
1487        }
1488
1489
1490        if (numDigits == 0) {
1491            // The $ didn't introduce a group number at all.
1492            // Treat it as just part of the substitution text.
1493            appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
1494            continue;
1495        }
1496
1497        // Finally, append the capture group data to the destination.
1498        destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1499        if (*status == U_BUFFER_OVERFLOW_ERROR) {
1500            // Ignore buffer overflow when extracting the group.  We need to
1501            //   continue on to get full size of the untruncated result.  We will
1502            //   raise our own buffer overflow error at the end.
1503            *status = U_ZERO_ERROR;
1504        }
1505
1506        if (U_FAILURE(*status)) {
1507            // Can fail if group number is out of range.
1508            break;
1509        }
1510
1511    }
1512
1513    //
1514    //  Nul Terminate the dest buffer if possible.
1515    //  Set the appropriate buffer overflow or not terminated error, if needed.
1516    //
1517    if (destIdx < capacity) {
1518        dest[destIdx] = 0;
1519    } else if (destIdx == *destCapacity) {
1520        *status = U_STRING_NOT_TERMINATED_WARNING;
1521    } else {
1522        *status = U_BUFFER_OVERFLOW_ERROR;
1523    }
1524
1525    //
1526    // Return an updated dest buffer and capacity to the caller.
1527    //
1528    if (destIdx > 0 &&  *destCapacity > 0) {
1529        if (destIdx < capacity) {
1530            *destBuf      += destIdx;
1531            *destCapacity -= destIdx;
1532        } else {
1533            *destBuf      += capacity;
1534            *destCapacity =  0;
1535        }
1536    }
1537
1538    // If we came in with a buffer overflow, make sure we go out with one also.
1539    //   (A zero length match right at the end of the previous match could
1540    //    make this function succeed even though a previous call had overflowed the buf)
1541    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1542        *status = U_BUFFER_OVERFLOW_ERROR;
1543    }
1544
1545    return destIdx;
1546}
1547
1548//
1549//   appendReplacement   the actual API function,
1550//
1551U_CAPI int32_t U_EXPORT2
1552uregex_appendReplacement(URegularExpression    *regexp2,
1553                         const UChar           *replacementText,
1554                         int32_t                replacementLength,
1555                         UChar                **destBuf,
1556                         int32_t               *destCapacity,
1557                         UErrorCode            *status) {
1558
1559    RegularExpression *regexp = (RegularExpression*)regexp2;
1560    return RegexCImpl::appendReplacement(
1561        regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1562}
1563
1564//
1565//   uregex_appendReplacementUText...can just use the normal C++ method
1566//
1567U_CAPI void U_EXPORT2
1568uregex_appendReplacementUText(URegularExpression    *regexp2,
1569                              UText                 *replText,
1570                              UText                 *dest,
1571                              UErrorCode            *status)  {
1572    RegularExpression *regexp = (RegularExpression*)regexp2;
1573    regexp->fMatcher->appendReplacement(dest, replText, *status);
1574}
1575
1576
1577//------------------------------------------------------------------------------
1578//
1579//    uregex_appendTail
1580//
1581//------------------------------------------------------------------------------
1582int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
1583                               UChar                **destBuf,
1584                               int32_t               *destCapacity,
1585                               UErrorCode            *status)
1586{
1587
1588    // If we come in with a buffer overflow error, don't suppress the operation.
1589    //  A series of appendReplacements, appendTail need to correctly preflight
1590    //  the buffer size when an overflow happens somewhere in the middle.
1591    UBool pendingBufferOverflow = FALSE;
1592    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1593        pendingBufferOverflow = TRUE;
1594        *status = U_ZERO_ERROR;
1595    }
1596
1597    if (validateRE(regexp, TRUE, status) == FALSE) {
1598        return 0;
1599    }
1600
1601    if (destCapacity == NULL || destBuf == NULL ||
1602        (*destBuf == NULL && *destCapacity > 0) ||
1603        *destCapacity < 0)
1604    {
1605        *status = U_ILLEGAL_ARGUMENT_ERROR;
1606        return 0;
1607    }
1608
1609    RegexMatcher *m = regexp->fMatcher;
1610
1611    int32_t  destIdx     = 0;
1612    int32_t  destCap     = *destCapacity;
1613    UChar    *dest       = *destBuf;
1614
1615    if (regexp->fText != NULL) {
1616        int32_t srcIdx;
1617        int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1618        if (nativeIdx == -1) {
1619            srcIdx = 0;
1620        } else if (UTEXT_USES_U16(m->fInputText)) {
1621            srcIdx = (int32_t)nativeIdx;
1622        } else {
1623            UErrorCode status = U_ZERO_ERROR;
1624            srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1625        }
1626
1627        for (;;) {
1628            if (srcIdx == regexp->fTextLength) {
1629                break;
1630            }
1631            UChar c = regexp->fText[srcIdx];
1632            if (c == 0 && regexp->fTextLength == -1) {
1633                regexp->fTextLength = srcIdx;
1634                break;
1635            }
1636            if (destIdx < destCap) {
1637                dest[destIdx] = c;
1638            } else {
1639                // We've overflowed the dest buffer.
1640                //  If the total input string length is known, we can
1641                //    compute the total buffer size needed without scanning through the string.
1642                if (regexp->fTextLength > 0) {
1643                    destIdx += (regexp->fTextLength - srcIdx);
1644                    break;
1645                }
1646            }
1647            srcIdx++;
1648            destIdx++;
1649        }
1650    } else {
1651        int64_t  srcIdx;
1652        if (m->fMatch) {
1653            // The most recent call to find() succeeded.
1654            srcIdx = m->fMatchEnd;
1655        } else {
1656            // The last call to find() on this matcher failed().
1657            //   Look back to the end of the last find() that succeeded for src index.
1658            srcIdx = m->fLastMatchEnd;
1659            if (srcIdx == -1)  {
1660                // There has been no successful match with this matcher.
1661                //   We want to copy the whole string.
1662                srcIdx = 0;
1663            }
1664        }
1665
1666        destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1667    }
1668
1669    //
1670    //  NUL terminate the output string, if possible, otherwise issue the
1671    //   appropriate error or warning.
1672    //
1673    if (destIdx < destCap) {
1674        dest[destIdx] = 0;
1675    } else  if (destIdx == destCap) {
1676        *status = U_STRING_NOT_TERMINATED_WARNING;
1677    } else {
1678        *status = U_BUFFER_OVERFLOW_ERROR;
1679    }
1680
1681    //
1682    // Update the user's buffer ptr and capacity vars to reflect the
1683    //   amount used.
1684    //
1685    if (destIdx < destCap) {
1686        *destBuf      += destIdx;
1687        *destCapacity -= destIdx;
1688    } else {
1689        *destBuf      += destCap;
1690        *destCapacity  = 0;
1691    }
1692
1693    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1694        *status = U_BUFFER_OVERFLOW_ERROR;
1695    }
1696
1697    return destIdx;
1698}
1699
1700
1701//
1702//   appendTail   the actual API function
1703//
1704U_CAPI int32_t U_EXPORT2
1705uregex_appendTail(URegularExpression    *regexp2,
1706                  UChar                **destBuf,
1707                  int32_t               *destCapacity,
1708                  UErrorCode            *status)  {
1709    RegularExpression *regexp = (RegularExpression*)regexp2;
1710    return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1711}
1712
1713
1714//
1715//   uregex_appendTailUText...can just use the normal C++ method
1716//
1717U_CAPI UText * U_EXPORT2
1718uregex_appendTailUText(URegularExpression    *regexp2,
1719                       UText                 *dest,
1720                       UErrorCode            *status)  {
1721    RegularExpression *regexp = (RegularExpression*)regexp2;
1722    return regexp->fMatcher->appendTail(dest, *status);
1723}
1724
1725
//------------------------------------------------------------------------------
//
//    copyString     Internal helper, currently compiled out by the #if 0 below.
//
//                   Appends srcLen UChars from srcPtr to destBuffer, starting
//                   at position *destIndex.  Only the UChars that fit below
//                   destCapacity are stored.  A terminating NUL is stored when
//                   there is room for it.  On return *destIndex has been
//                   advanced by srcLen + 1 (the string plus its NUL), whether
//                   or not everything fit, so the caller can compute the size
//                   that would have been needed.
//
//------------------------------------------------------------------------------
#if 0
static void copyString(UChar        *destBuffer,    //  Destination buffer.
                       int32_t       destCapacity,  //  Total capacity of dest buffer
                       int32_t      *destIndex,     //  In: where to start writing.
                                                    //  Out: position after the NUL; not
                                                    //    clipped to destCapacity.
                       const UChar  *srcPtr,        //  Pointer to source string
                       int32_t       srcLen)        //  Source string len.
{
    int32_t  outPos = *destIndex;
    int32_t  inPos  = 0;

    // Copy for as long as there is source left and room in the destination.
    while (inPos < srcLen && outPos < destCapacity) {
        destBuffer[outPos++] = srcPtr[inPos++];
    }

    // Anything that did not fit still counts toward the reported size.
    if (inPos < srcLen) {
        outPos += srcLen - inPos;
    }

    // Terminate when possible; the NUL is always included in the count.
    if (outPos < destCapacity) {
        destBuffer[outPos] = 0;
    }
    *destIndex = outPos + 1;
}
#endif
1763
1764//------------------------------------------------------------------------------
1765//
1766//    uregex_split
1767//
1768//------------------------------------------------------------------------------
1769int32_t RegexCImpl::split(RegularExpression     *regexp,
1770                          UChar                 *destBuf,
1771                          int32_t                destCapacity,
1772                          int32_t               *requiredCapacity,
1773                          UChar                 *destFields[],
1774                          int32_t                destFieldsCapacity,
1775                          UErrorCode            *status) {
1776    //
1777    // Reset for the input text
1778    //
1779    regexp->fMatcher->reset();
1780    UText *inputText = regexp->fMatcher->fInputText;
1781    int64_t   nextOutputStringStart = 0;
1782    int64_t   inputLen = regexp->fMatcher->fInputLength;
1783    if (inputLen == 0) {
1784        return 0;
1785    }
1786
1787    //
1788    // Loop through the input text, searching for the delimiter pattern
1789    //
1790    int32_t   i;             // Index of the field being processed.
1791    int32_t   destIdx = 0;   // Next available position in destBuf;
1792    int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
1793    UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
1794    for (i=0; ; i++) {
1795        if (i>=destFieldsCapacity-1) {
1796            // There are one or zero output strings left.
1797            // Fill the last output string with whatever is left from the input, then exit the loop.
1798            //  ( i will be == destFieldsCapacity if we filled the output array while processing
1799            //    capture groups of the delimiter expression, in which case we will discard the
1800            //    last capture group saved in favor of the unprocessed remainder of the
1801            //    input string.)
1802            if (inputLen > nextOutputStringStart) {
1803                if (i != destFieldsCapacity-1) {
1804                    // No fields are left.  Recycle the last one for holding the trailing part of
1805                    //   the input string.
1806                    i = destFieldsCapacity-1;
1807                    destIdx = (int32_t)(destFields[i] - destFields[0]);
1808                }
1809
1810                destFields[i] = &destBuf[destIdx];
1811                destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1812                                             &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1813            }
1814            break;
1815        }
1816
1817        if (regexp->fMatcher->find()) {
1818            // We found another delimiter.  Move everything from where we started looking
1819            //  up until the start of the delimiter into the next output string.
1820            destFields[i] = &destBuf[destIdx];
1821
1822            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1823                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1824            if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1825                tStatus = U_ZERO_ERROR;
1826            } else {
1827                *status = tStatus;
1828            }
1829            nextOutputStringStart = regexp->fMatcher->fMatchEnd;
1830
1831            // If the delimiter pattern has capturing parentheses, the captured
1832            //  text goes out into the next n destination strings.
1833            int32_t groupNum;
1834            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1835                // If we've run out of output string slots, bail out.
1836                if (i==destFieldsCapacity-1) {
1837                    break;
1838                }
1839                i++;
1840
1841                // Set up to extract the capture group contents into the dest buffer.
1842                destFields[i] = &destBuf[destIdx];
1843                tStatus = U_ZERO_ERROR;
1844                int32_t t = uregex_group((URegularExpression*)regexp,
1845                                         groupNum,
1846                                         destFields[i],
1847                                         REMAINING_CAPACITY(destIdx, destCapacity),
1848                                         &tStatus);
1849                destIdx += t + 1;    // Record the space used in the output string buffer.
1850                                     //  +1 for the NUL that terminates the string.
1851                if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1852                    tStatus = U_ZERO_ERROR;
1853                } else {
1854                    *status = tStatus;
1855                }
1856            }
1857
1858            if (nextOutputStringStart == inputLen) {
1859                // The delimiter was at the end of the string.
1860                // Output an empty string, and then we are done.
1861                if (destIdx < destCapacity) {
1862                    destBuf[destIdx] = 0;
1863                }
1864                if (i < destFieldsCapacity-1) {
1865                   ++i;
1866                }
1867                if (destIdx < destCapacity) {
1868                    destFields[i] = destBuf + destIdx;
1869                }
1870                ++destIdx;
1871                break;
1872            }
1873
1874        }
1875        else
1876        {
1877            // We ran off the end of the input while looking for the next delimiter.
1878            // All the remaining text goes into the current output string.
1879            destFields[i] = &destBuf[destIdx];
1880            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1881                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1882            break;
1883        }
1884    }
1885
1886    // Zero out any unused portion of the destFields array
1887    int j;
1888    for (j=i+1; j<destFieldsCapacity; j++) {
1889        destFields[j] = NULL;
1890    }
1891
1892    if (requiredCapacity != NULL) {
1893        *requiredCapacity = destIdx;
1894    }
1895    if (destIdx > destCapacity) {
1896        *status = U_BUFFER_OVERFLOW_ERROR;
1897    }
1898    return i+1;
1899}
1900
1901//
1902//   uregex_split   The actual API function
1903//
1904U_CAPI int32_t U_EXPORT2
1905uregex_split(URegularExpression      *regexp2,
1906             UChar                   *destBuf,
1907             int32_t                  destCapacity,
1908             int32_t                 *requiredCapacity,
1909             UChar                   *destFields[],
1910             int32_t                  destFieldsCapacity,
1911             UErrorCode              *status) {
1912    RegularExpression *regexp = (RegularExpression*)regexp2;
1913    if (validateRE(regexp, TRUE, status) == FALSE) {
1914        return 0;
1915    }
1916    if ((destBuf == NULL && destCapacity > 0) ||
1917        destCapacity < 0 ||
1918        destFields == NULL ||
1919        destFieldsCapacity < 1 ) {
1920        *status = U_ILLEGAL_ARGUMENT_ERROR;
1921        return 0;
1922    }
1923
1924    return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1925}
1926
1927
1928//
1929//   uregex_splitUText...can just use the normal C++ method
1930//
1931U_CAPI int32_t U_EXPORT2
1932uregex_splitUText(URegularExpression    *regexp2,
1933                  UText                 *destFields[],
1934                  int32_t                destFieldsCapacity,
1935                  UErrorCode            *status) {
1936    RegularExpression *regexp = (RegularExpression*)regexp2;
1937    return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1938}
1939
1940
1941#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1942
1943