1/*
2*******************************************************************************
3*   Copyright (C) 2004-2015, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  uregex.cpp
7*/
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13#include "unicode/regex.h"
14#include "unicode/uregex.h"
15#include "unicode/unistr.h"
16#include "unicode/ustring.h"
17#include "unicode/uchar.h"
18#include "unicode/uobject.h"
19#include "unicode/utf16.h"
20#include "cmemory.h"
21#include "uassert.h"
22#include "uhash.h"
23#include "umutex.h"
24#include "uvectr32.h"
25
26#include "regextxt.h"
27
28U_NAMESPACE_BEGIN
29
// Yields (len - idx) when that difference is positive, otherwise 0.
// Each argument is expanded more than once, so pass expressions without side effects.
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx)) > 0) ? ((len)-(idx)) : 0)
31
32struct RegularExpression: public UMemory {
33public:
34    RegularExpression();
35    ~RegularExpression();
36    int32_t           fMagic;
37    RegexPattern     *fPat;
38    u_atomic_int32_t *fPatRefCount;
39    UChar            *fPatString;
40    int32_t           fPatStringLen;
41    RegexMatcher     *fMatcher;
42    const UChar      *fText;         // Text from setText()
43    int32_t           fTextLength;   // Length provided by user with setText(), which
44                                     //  may be -1.
45    UBool             fOwnsText;
46};
47
// Tag stored in RegularExpression::fMagic; the bytes 0x72 0x65 0x78 0x70 spell "rexp" in ASCII.
static const int32_t REXP_MAGIC = 0x72657870;
49
50RegularExpression::RegularExpression() {
51    fMagic        = REXP_MAGIC;
52    fPat          = NULL;
53    fPatRefCount  = NULL;
54    fPatString    = NULL;
55    fPatStringLen = 0;
56    fMatcher      = NULL;
57    fText         = NULL;
58    fTextLength   = 0;
59    fOwnsText     = FALSE;
60}
61
62RegularExpression::~RegularExpression() {
63    delete fMatcher;
64    fMatcher = NULL;
65    if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
66        delete fPat;
67        uprv_free(fPatString);
68        uprv_free((void *)fPatRefCount);
69    }
70    if (fOwnsText && fText!=NULL) {
71        uprv_free((void *)fText);
72    }
73    fMagic = 0;
74}
75
76U_NAMESPACE_END
77
78U_NAMESPACE_USE
79
80//----------------------------------------------------------------------------------------
81//
82//   validateRE    Do boilerplate style checks on API function parameters.
83//                 Return TRUE if they look OK.
84//----------------------------------------------------------------------------------------
85static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
86    if (U_FAILURE(*status)) {
87        return FALSE;
88    }
89    if (re == NULL || re->fMagic != REXP_MAGIC) {
90        *status = U_ILLEGAL_ARGUMENT_ERROR;
91        return FALSE;
92    }
93    // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
94    if (requiresText && re->fText == NULL && !re->fOwnsText) {
95        *status = U_REGEX_INVALID_STATE;
96        return FALSE;
97    }
98    return TRUE;
99}
100
101//----------------------------------------------------------------------------------------
102//
103//    uregex_open
104//
105//----------------------------------------------------------------------------------------
106U_CAPI URegularExpression *  U_EXPORT2
107uregex_open( const  UChar          *pattern,
108                    int32_t         patternLength,
109                    uint32_t        flags,
110                    UParseError    *pe,
111                    UErrorCode     *status) {
112
113    if (U_FAILURE(*status)) {
114        return NULL;
115    }
116    if (pattern == NULL || patternLength < -1 || patternLength == 0) {
117        *status = U_ILLEGAL_ARGUMENT_ERROR;
118        return NULL;
119    }
120    int32_t actualPatLen = patternLength;
121    if (actualPatLen == -1) {
122        actualPatLen = u_strlen(pattern);
123    }
124
125    RegularExpression  *re     = new RegularExpression;
126    u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
127    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
128    if (re == NULL || refC == NULL || patBuf == NULL) {
129        *status = U_MEMORY_ALLOCATION_ERROR;
130        delete re;
131        uprv_free((void *)refC);
132        uprv_free(patBuf);
133        return NULL;
134    }
135    re->fPatRefCount = refC;
136    *re->fPatRefCount = 1;
137
138    //
139    // Make a copy of the pattern string, so we can return it later if asked.
140    //    For compiling the pattern, we will use a UText wrapper around
141    //    this local copy, to avoid making even more copies.
142    //
143    re->fPatString    = patBuf;
144    re->fPatStringLen = patternLength;
145    u_memcpy(patBuf, pattern, actualPatLen);
146    patBuf[actualPatLen] = 0;
147
148    UText patText = UTEXT_INITIALIZER;
149    utext_openUChars(&patText, patBuf, patternLength, status);
150
151    //
152    // Compile the pattern
153    //
154    if (pe != NULL) {
155        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
156    } else {
157        re->fPat = RegexPattern::compile(&patText, flags, *status);
158    }
159    utext_close(&patText);
160
161    if (U_FAILURE(*status)) {
162        goto ErrorExit;
163    }
164
165    //
166    // Create the matcher object
167    //
168    re->fMatcher = re->fPat->matcher(*status);
169    if (U_SUCCESS(*status)) {
170        return (URegularExpression*)re;
171    }
172
173ErrorExit:
174    delete re;
175    return NULL;
176
177}
178
179//----------------------------------------------------------------------------------------
180//
181//    uregex_openUText
182//
183//----------------------------------------------------------------------------------------
184U_CAPI URegularExpression *  U_EXPORT2
185uregex_openUText(UText          *pattern,
186                 uint32_t        flags,
187                 UParseError    *pe,
188                 UErrorCode     *status) {
189
190    if (U_FAILURE(*status)) {
191        return NULL;
192    }
193    if (pattern == NULL) {
194        *status = U_ILLEGAL_ARGUMENT_ERROR;
195        return NULL;
196    }
197
198    int64_t patternNativeLength = utext_nativeLength(pattern);
199
200    if (patternNativeLength == 0) {
201        *status = U_ILLEGAL_ARGUMENT_ERROR;
202        return NULL;
203    }
204
205    RegularExpression *re     = new RegularExpression;
206
207    UErrorCode lengthStatus = U_ZERO_ERROR;
208    int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
209
210    u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
211    UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
212    if (re == NULL || refC == NULL || patBuf == NULL) {
213        *status = U_MEMORY_ALLOCATION_ERROR;
214        delete re;
215        uprv_free((void *)refC);
216        uprv_free(patBuf);
217        return NULL;
218    }
219    re->fPatRefCount = refC;
220    *re->fPatRefCount = 1;
221
222    //
223    // Make a copy of the pattern string, so we can return it later if asked.
224    //    For compiling the pattern, we will use a read-only UText wrapper
225    //    around this local copy, to avoid making even more copies.
226    //
227    re->fPatString    = patBuf;
228    re->fPatStringLen = pattern16Length;
229    utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
230
231    UText patText = UTEXT_INITIALIZER;
232    utext_openUChars(&patText, patBuf, pattern16Length, status);
233
234    //
235    // Compile the pattern
236    //
237    if (pe != NULL) {
238        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
239    } else {
240        re->fPat = RegexPattern::compile(&patText, flags, *status);
241    }
242    utext_close(&patText);
243
244    if (U_FAILURE(*status)) {
245        goto ErrorExit;
246    }
247
248    //
249    // Create the matcher object
250    //
251    re->fMatcher = re->fPat->matcher(*status);
252    if (U_SUCCESS(*status)) {
253        return (URegularExpression*)re;
254    }
255
256ErrorExit:
257    delete re;
258    return NULL;
259
260}
261
262//----------------------------------------------------------------------------------------
263//
264//    uregex_close
265//
266//----------------------------------------------------------------------------------------
267U_CAPI void  U_EXPORT2
268uregex_close(URegularExpression  *re2) {
269    RegularExpression *re = (RegularExpression*)re2;
270    UErrorCode  status = U_ZERO_ERROR;
271    if (validateRE(re, FALSE, &status) == FALSE) {
272        return;
273    }
274    delete re;
275}
276
277
278//----------------------------------------------------------------------------------------
279//
280//    uregex_clone
281//
282//----------------------------------------------------------------------------------------
283U_CAPI URegularExpression * U_EXPORT2
284uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
285    RegularExpression *source = (RegularExpression*)source2;
286    if (validateRE(source, FALSE, status) == FALSE) {
287        return NULL;
288    }
289
290    RegularExpression *clone = new RegularExpression;
291    if (clone == NULL) {
292        *status = U_MEMORY_ALLOCATION_ERROR;
293        return NULL;
294    }
295
296    clone->fMatcher = source->fPat->matcher(*status);
297    if (U_FAILURE(*status)) {
298        delete clone;
299        return NULL;
300    }
301
302    clone->fPat          = source->fPat;
303    clone->fPatRefCount  = source->fPatRefCount;
304    clone->fPatString    = source->fPatString;
305    clone->fPatStringLen = source->fPatStringLen;
306    umtx_atomic_inc(source->fPatRefCount);
307    // Note:  fText is not cloned.
308
309    return (URegularExpression*)clone;
310}
311
312
313
314
315//------------------------------------------------------------------------------
316//
317//    uregex_pattern
318//
319//------------------------------------------------------------------------------
320U_CAPI const UChar * U_EXPORT2
321uregex_pattern(const  URegularExpression *regexp2,
322                      int32_t            *patLength,
323                      UErrorCode         *status)  {
324    RegularExpression *regexp = (RegularExpression*)regexp2;
325
326    if (validateRE(regexp, FALSE, status) == FALSE) {
327        return NULL;
328    }
329    if (patLength != NULL) {
330        *patLength = regexp->fPatStringLen;
331    }
332    return regexp->fPatString;
333}
334
335
336//------------------------------------------------------------------------------
337//
338//    uregex_patternUText
339//
340//------------------------------------------------------------------------------
341U_CAPI UText * U_EXPORT2
342uregex_patternUText(const URegularExpression *regexp2,
343                          UErrorCode         *status)  {
344    RegularExpression *regexp = (RegularExpression*)regexp2;
345    return regexp->fPat->patternText(*status);
346}
347
348
349//------------------------------------------------------------------------------
350//
351//    uregex_flags
352//
353//------------------------------------------------------------------------------
354U_CAPI int32_t U_EXPORT2
355uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
356    RegularExpression *regexp = (RegularExpression*)regexp2;
357    if (validateRE(regexp, FALSE, status) == FALSE) {
358        return 0;
359    }
360    int32_t flags = regexp->fPat->flags();
361    return flags;
362}
363
364
365//------------------------------------------------------------------------------
366//
367//    uregex_setText
368//
369//------------------------------------------------------------------------------
370U_CAPI void U_EXPORT2
371uregex_setText(URegularExpression *regexp2,
372               const UChar        *text,
373               int32_t             textLength,
374               UErrorCode         *status)  {
375    RegularExpression *regexp = (RegularExpression*)regexp2;
376    if (validateRE(regexp, FALSE, status) == FALSE) {
377        return;
378    }
379    if (text == NULL || textLength < -1) {
380        *status = U_ILLEGAL_ARGUMENT_ERROR;
381        return;
382    }
383
384    if (regexp->fOwnsText && regexp->fText != NULL) {
385        uprv_free((void *)regexp->fText);
386    }
387
388    regexp->fText       = text;
389    regexp->fTextLength = textLength;
390    regexp->fOwnsText   = FALSE;
391
392    UText input = UTEXT_INITIALIZER;
393    utext_openUChars(&input, text, textLength, status);
394    regexp->fMatcher->reset(&input);
395    utext_close(&input); // reset() made a shallow clone, so we don't need this copy
396}
397
398
399//------------------------------------------------------------------------------
400//
401//    uregex_setUText
402//
403//------------------------------------------------------------------------------
404U_CAPI void U_EXPORT2
405uregex_setUText(URegularExpression *regexp2,
406                UText              *text,
407                UErrorCode         *status) {
408    RegularExpression *regexp = (RegularExpression*)regexp2;
409    if (validateRE(regexp, FALSE, status) == FALSE) {
410        return;
411    }
412    if (text == NULL) {
413        *status = U_ILLEGAL_ARGUMENT_ERROR;
414        return;
415    }
416
417    if (regexp->fOwnsText && regexp->fText != NULL) {
418        uprv_free((void *)regexp->fText);
419    }
420
421    regexp->fText       = NULL; // only fill it in on request
422    regexp->fTextLength = -1;
423    regexp->fOwnsText   = TRUE;
424    regexp->fMatcher->reset(text);
425}
426
427
428
429//------------------------------------------------------------------------------
430//
431//    uregex_getText
432//
433//------------------------------------------------------------------------------
434U_CAPI const UChar * U_EXPORT2
435uregex_getText(URegularExpression *regexp2,
436               int32_t            *textLength,
437               UErrorCode         *status)  {
438    RegularExpression *regexp = (RegularExpression*)regexp2;
439    if (validateRE(regexp, FALSE, status) == FALSE) {
440        return NULL;
441    }
442
443    if (regexp->fText == NULL) {
444        // need to fill in the text
445        UText *inputText = regexp->fMatcher->inputText();
446        int64_t inputNativeLength = utext_nativeLength(inputText);
447        if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
448            regexp->fText = inputText->chunkContents;
449            regexp->fTextLength = (int32_t)inputNativeLength;
450            regexp->fOwnsText = FALSE; // because the UText owns it
451        } else {
452            UErrorCode lengthStatus = U_ZERO_ERROR;
453            regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
454            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
455
456            utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
457            regexp->fText = inputChars;
458            regexp->fOwnsText = TRUE; // should already be set but just in case
459        }
460    }
461
462    if (textLength != NULL) {
463        *textLength = regexp->fTextLength;
464    }
465    return regexp->fText;
466}
467
468
469//------------------------------------------------------------------------------
470//
471//    uregex_getUText
472//
473//------------------------------------------------------------------------------
474U_CAPI UText * U_EXPORT2
475uregex_getUText(URegularExpression *regexp2,
476                UText              *dest,
477                UErrorCode         *status)  {
478    RegularExpression *regexp = (RegularExpression*)regexp2;
479    if (validateRE(regexp, FALSE, status) == FALSE) {
480        return dest;
481    }
482    return regexp->fMatcher->getInput(dest, *status);
483}
484
485
486//------------------------------------------------------------------------------
487//
488//    uregex_refreshUText
489//
490//------------------------------------------------------------------------------
491U_CAPI void U_EXPORT2
492uregex_refreshUText(URegularExpression *regexp2,
493                    UText              *text,
494                    UErrorCode         *status) {
495    RegularExpression *regexp = (RegularExpression*)regexp2;
496    if (validateRE(regexp, FALSE, status) == FALSE) {
497        return;
498    }
499    regexp->fMatcher->refreshInputText(text, *status);
500}
501
502
503//------------------------------------------------------------------------------
504//
505//    uregex_matches
506//
507//------------------------------------------------------------------------------
508U_CAPI UBool U_EXPORT2
509uregex_matches(URegularExpression *regexp2,
510               int32_t            startIndex,
511               UErrorCode        *status)  {
512    return uregex_matches64( regexp2, (int64_t)startIndex, status);
513}
514
515U_CAPI UBool U_EXPORT2
516uregex_matches64(URegularExpression *regexp2,
517                 int64_t            startIndex,
518                 UErrorCode        *status)  {
519    RegularExpression *regexp = (RegularExpression*)regexp2;
520    UBool result = FALSE;
521    if (validateRE(regexp, TRUE, status) == FALSE) {
522        return result;
523    }
524    if (startIndex == -1) {
525        result = regexp->fMatcher->matches(*status);
526    } else {
527        result = regexp->fMatcher->matches(startIndex, *status);
528    }
529    return result;
530}
531
532
533//------------------------------------------------------------------------------
534//
535//    uregex_lookingAt
536//
537//------------------------------------------------------------------------------
538U_CAPI UBool U_EXPORT2
539uregex_lookingAt(URegularExpression *regexp2,
540                 int32_t             startIndex,
541                 UErrorCode         *status)  {
542    return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
543}
544
545U_CAPI UBool U_EXPORT2
546uregex_lookingAt64(URegularExpression *regexp2,
547                   int64_t             startIndex,
548                   UErrorCode         *status)  {
549    RegularExpression *regexp = (RegularExpression*)regexp2;
550    UBool result = FALSE;
551    if (validateRE(regexp, TRUE, status) == FALSE) {
552        return result;
553    }
554    if (startIndex == -1) {
555        result = regexp->fMatcher->lookingAt(*status);
556    } else {
557        result = regexp->fMatcher->lookingAt(startIndex, *status);
558    }
559    return result;
560}
561
562
563
564//------------------------------------------------------------------------------
565//
566//    uregex_find
567//
568//------------------------------------------------------------------------------
569U_CAPI UBool U_EXPORT2
570uregex_find(URegularExpression *regexp2,
571            int32_t             startIndex,
572            UErrorCode         *status)  {
573    return uregex_find64( regexp2, (int64_t)startIndex, status);
574}
575
576U_CAPI UBool U_EXPORT2
577uregex_find64(URegularExpression *regexp2,
578              int64_t             startIndex,
579              UErrorCode         *status)  {
580    RegularExpression *regexp = (RegularExpression*)regexp2;
581    UBool result = FALSE;
582    if (validateRE(regexp, TRUE, status) == FALSE) {
583        return result;
584    }
585    if (startIndex == -1) {
586        regexp->fMatcher->resetPreserveRegion();
587        result = regexp->fMatcher->find(*status);
588    } else {
589        result = regexp->fMatcher->find(startIndex, *status);
590    }
591    return result;
592}
593
594
595//------------------------------------------------------------------------------
596//
597//    uregex_findNext
598//
599//------------------------------------------------------------------------------
600U_CAPI UBool U_EXPORT2
601uregex_findNext(URegularExpression *regexp2,
602                UErrorCode         *status)  {
603    RegularExpression *regexp = (RegularExpression*)regexp2;
604    if (validateRE(regexp, TRUE, status) == FALSE) {
605        return FALSE;
606    }
607    UBool result = regexp->fMatcher->find(*status);
608    return result;
609}
610
611//------------------------------------------------------------------------------
612//
613//    uregex_groupCount
614//
615//------------------------------------------------------------------------------
616U_CAPI int32_t U_EXPORT2
617uregex_groupCount(URegularExpression *regexp2,
618                  UErrorCode         *status)  {
619    RegularExpression *regexp = (RegularExpression*)regexp2;
620    if (validateRE(regexp, FALSE, status) == FALSE) {
621        return 0;
622    }
623    int32_t  result = regexp->fMatcher->groupCount();
624    return result;
625}
626
627
628//------------------------------------------------------------------------------
629//
630//    uregex_groupNumberFromName
631//
632//------------------------------------------------------------------------------
633int32_t
634uregex_groupNumberFromName(URegularExpression *regexp2,
635                           const UChar        *groupName,
636                           int32_t             nameLength,
637                           UErrorCode          *status) {
638    RegularExpression *regexp = (RegularExpression*)regexp2;
639    if (validateRE(regexp, FALSE, status) == FALSE) {
640        return 0;
641    }
642    int32_t  result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status);
643    return result;
644}
645
646int32_t
647uregex_groupNumberFromCName(URegularExpression *regexp2,
648                            const char         *groupName,
649                            int32_t             nameLength,
650                            UErrorCode          *status) {
651    RegularExpression *regexp = (RegularExpression*)regexp2;
652    if (validateRE(regexp, FALSE, status) == FALSE) {
653        return 0;
654    }
655    return regexp->fPat->groupNumberFromName(groupName, nameLength, *status);
656}
657
658//------------------------------------------------------------------------------
659//
660//    uregex_group
661//
662//------------------------------------------------------------------------------
663U_CAPI int32_t U_EXPORT2
664uregex_group(URegularExpression *regexp2,
665             int32_t             groupNum,
666             UChar              *dest,
667             int32_t             destCapacity,
668             UErrorCode          *status)  {
669    RegularExpression *regexp = (RegularExpression*)regexp2;
670    if (validateRE(regexp, TRUE, status) == FALSE) {
671        return 0;
672    }
673    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
674        *status = U_ILLEGAL_ARGUMENT_ERROR;
675        return 0;
676    }
677
678    if (destCapacity == 0 || regexp->fText != NULL) {
679        // If preflighting or if we already have the text as UChars,
680        // this is a little cheaper than extracting from the UText
681
682        //
683        // Pick up the range of characters from the matcher
684        //
685        int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
686        int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
687        if (U_FAILURE(*status)) {
688            return 0;
689        }
690
691        //
692        // Trim length based on buffer capacity
693        //
694        int32_t fullLength = endIx - startIx;
695        int32_t copyLength = fullLength;
696        if (copyLength < destCapacity) {
697            dest[copyLength] = 0;
698        } else if (copyLength == destCapacity) {
699            *status = U_STRING_NOT_TERMINATED_WARNING;
700        } else {
701            copyLength = destCapacity;
702            *status = U_BUFFER_OVERFLOW_ERROR;
703        }
704
705        //
706        // Copy capture group to user's buffer
707        //
708        if (copyLength > 0) {
709            u_memcpy(dest, &regexp->fText[startIx], copyLength);
710        }
711        return fullLength;
712    } else {
713        int64_t  start = regexp->fMatcher->start64(groupNum, *status);
714        int64_t  limit = regexp->fMatcher->end64(groupNum, *status);
715        if (U_FAILURE(*status)) {
716            return 0;
717        }
718        // Note edge cases:
719        //   Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
720        //   Zero Length Match: start == end.
721        int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status);
722        return length;
723    }
724
725}
726
727
728//------------------------------------------------------------------------------
729//
730//    uregex_groupUText
731//
732//------------------------------------------------------------------------------
733U_CAPI UText * U_EXPORT2
734uregex_groupUText(URegularExpression *regexp2,
735                  int32_t             groupNum,
736                  UText              *dest,
737                  int64_t            *groupLength,
738                  UErrorCode         *status)  {
739    RegularExpression *regexp = (RegularExpression*)regexp2;
740    if (validateRE(regexp, TRUE, status) == FALSE) {
741        UErrorCode emptyTextStatus = U_ZERO_ERROR;
742        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
743    }
744
745    return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
746}
747
748//------------------------------------------------------------------------------
749//
750//    uregex_start
751//
752//------------------------------------------------------------------------------
753U_CAPI int32_t U_EXPORT2
754uregex_start(URegularExpression *regexp2,
755             int32_t             groupNum,
756             UErrorCode          *status)  {
757    return (int32_t)uregex_start64( regexp2, groupNum, status);
758}
759
760U_CAPI int64_t U_EXPORT2
761uregex_start64(URegularExpression *regexp2,
762               int32_t             groupNum,
763               UErrorCode          *status)  {
764    RegularExpression *regexp = (RegularExpression*)regexp2;
765    if (validateRE(regexp, TRUE, status) == FALSE) {
766        return 0;
767    }
768    int32_t result = regexp->fMatcher->start(groupNum, *status);
769    return result;
770}
771
772//------------------------------------------------------------------------------
773//
774//    uregex_end
775//
776//------------------------------------------------------------------------------
777U_CAPI int32_t U_EXPORT2
778uregex_end(URegularExpression   *regexp2,
779           int32_t               groupNum,
780           UErrorCode           *status)  {
781    return (int32_t)uregex_end64( regexp2, groupNum, status);
782}
783
784U_CAPI int64_t U_EXPORT2
785uregex_end64(URegularExpression   *regexp2,
786             int32_t               groupNum,
787             UErrorCode           *status)  {
788    RegularExpression *regexp = (RegularExpression*)regexp2;
789    if (validateRE(regexp, TRUE, status) == FALSE) {
790        return 0;
791    }
792    int32_t result = regexp->fMatcher->end(groupNum, *status);
793    return result;
794}
795
796//------------------------------------------------------------------------------
797//
798//    uregex_reset
799//
800//------------------------------------------------------------------------------
801U_CAPI void U_EXPORT2
802uregex_reset(URegularExpression    *regexp2,
803             int32_t               index,
804             UErrorCode            *status)  {
805    uregex_reset64( regexp2, (int64_t)index, status);
806}
807
808U_CAPI void U_EXPORT2
809uregex_reset64(URegularExpression    *regexp2,
810               int64_t               index,
811               UErrorCode            *status)  {
812    RegularExpression *regexp = (RegularExpression*)regexp2;
813    if (validateRE(regexp, TRUE, status) == FALSE) {
814        return;
815    }
816    regexp->fMatcher->reset(index, *status);
817}
818
819
820//------------------------------------------------------------------------------
821//
822//    uregex_setRegion
823//
824//------------------------------------------------------------------------------
825U_CAPI void U_EXPORT2
826uregex_setRegion(URegularExpression   *regexp2,
827                 int32_t               regionStart,
828                 int32_t               regionLimit,
829                 UErrorCode           *status)  {
830    uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
831}
832
833U_CAPI void U_EXPORT2
834uregex_setRegion64(URegularExpression   *regexp2,
835                   int64_t               regionStart,
836                   int64_t               regionLimit,
837                   UErrorCode           *status)  {
838    RegularExpression *regexp = (RegularExpression*)regexp2;
839    if (validateRE(regexp, TRUE, status) == FALSE) {
840        return;
841    }
842    regexp->fMatcher->region(regionStart, regionLimit, *status);
843}
844
845
846//------------------------------------------------------------------------------
847//
848//    uregex_setRegionAndStart
849//
850//------------------------------------------------------------------------------
851U_CAPI void U_EXPORT2
852uregex_setRegionAndStart(URegularExpression   *regexp2,
853                 int64_t               regionStart,
854                 int64_t               regionLimit,
855                 int64_t               startIndex,
856                 UErrorCode           *status)  {
857    RegularExpression *regexp = (RegularExpression*)regexp2;
858    if (validateRE(regexp, TRUE, status) == FALSE) {
859        return;
860    }
861    regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
862}
863
864//------------------------------------------------------------------------------
865//
866//    uregex_regionStart
867//
868//------------------------------------------------------------------------------
869U_CAPI int32_t U_EXPORT2
870uregex_regionStart(const  URegularExpression   *regexp2,
871                          UErrorCode           *status)  {
872    return (int32_t)uregex_regionStart64(regexp2, status);
873}
874
875U_CAPI int64_t U_EXPORT2
876uregex_regionStart64(const  URegularExpression   *regexp2,
877                            UErrorCode           *status)  {
878    RegularExpression *regexp = (RegularExpression*)regexp2;
879    if (validateRE(regexp, TRUE, status) == FALSE) {
880        return 0;
881    }
882    return regexp->fMatcher->regionStart();
883}
884
885
886//------------------------------------------------------------------------------
887//
888//    uregex_regionEnd
889//
890//------------------------------------------------------------------------------
891U_CAPI int32_t U_EXPORT2
892uregex_regionEnd(const  URegularExpression   *regexp2,
893                        UErrorCode           *status)  {
894    return (int32_t)uregex_regionEnd64(regexp2, status);
895}
896
897U_CAPI int64_t U_EXPORT2
898uregex_regionEnd64(const  URegularExpression   *regexp2,
899                          UErrorCode           *status)  {
900    RegularExpression *regexp = (RegularExpression*)regexp2;
901    if (validateRE(regexp, TRUE, status) == FALSE) {
902        return 0;
903    }
904    return regexp->fMatcher->regionEnd();
905}
906
907
908//------------------------------------------------------------------------------
909//
910//    uregex_hasTransparentBounds
911//
912//------------------------------------------------------------------------------
913U_CAPI UBool U_EXPORT2
914uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
915                                   UErrorCode           *status)  {
916    RegularExpression *regexp = (RegularExpression*)regexp2;
917    if (validateRE(regexp, FALSE, status) == FALSE) {
918        return FALSE;
919    }
920    return regexp->fMatcher->hasTransparentBounds();
921}
922
923
924//------------------------------------------------------------------------------
925//
926//    uregex_useTransparentBounds
927//
928//------------------------------------------------------------------------------
929U_CAPI void U_EXPORT2
930uregex_useTransparentBounds(URegularExpression    *regexp2,
931                            UBool                  b,
932                            UErrorCode            *status)  {
933    RegularExpression *regexp = (RegularExpression*)regexp2;
934    if (validateRE(regexp, FALSE, status) == FALSE) {
935        return;
936    }
937    regexp->fMatcher->useTransparentBounds(b);
938}
939
940
941//------------------------------------------------------------------------------
942//
943//    uregex_hasAnchoringBounds
944//
945//------------------------------------------------------------------------------
946U_CAPI UBool U_EXPORT2
947uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
948                                 UErrorCode           *status)  {
949    RegularExpression *regexp = (RegularExpression*)regexp2;
950    if (validateRE(regexp, FALSE, status) == FALSE) {
951        return FALSE;
952    }
953    return regexp->fMatcher->hasAnchoringBounds();
954}
955
956
957//------------------------------------------------------------------------------
958//
959//    uregex_useAnchoringBounds
960//
961//------------------------------------------------------------------------------
962U_CAPI void U_EXPORT2
963uregex_useAnchoringBounds(URegularExpression    *regexp2,
964                          UBool                  b,
965                          UErrorCode            *status)  {
966    RegularExpression *regexp = (RegularExpression*)regexp2;
967    if (validateRE(regexp, FALSE, status) == FALSE) {
968        return;
969    }
970    regexp->fMatcher->useAnchoringBounds(b);
971}
972
973
974//------------------------------------------------------------------------------
975//
976//    uregex_hitEnd
977//
978//------------------------------------------------------------------------------
979U_CAPI UBool U_EXPORT2
980uregex_hitEnd(const  URegularExpression   *regexp2,
981                     UErrorCode           *status)  {
982    RegularExpression *regexp = (RegularExpression*)regexp2;
983    if (validateRE(regexp, TRUE, status) == FALSE) {
984        return FALSE;
985    }
986    return regexp->fMatcher->hitEnd();
987}
988
989
990//------------------------------------------------------------------------------
991//
992//    uregex_requireEnd
993//
994//------------------------------------------------------------------------------
995U_CAPI UBool U_EXPORT2
996uregex_requireEnd(const  URegularExpression   *regexp2,
997                         UErrorCode           *status)  {
998    RegularExpression *regexp = (RegularExpression*)regexp2;
999    if (validateRE(regexp, TRUE, status) == FALSE) {
1000        return FALSE;
1001    }
1002    return regexp->fMatcher->requireEnd();
1003}
1004
1005
1006//------------------------------------------------------------------------------
1007//
1008//    uregex_setTimeLimit
1009//
1010//------------------------------------------------------------------------------
1011U_CAPI void U_EXPORT2
1012uregex_setTimeLimit(URegularExpression   *regexp2,
1013                    int32_t               limit,
1014                    UErrorCode           *status) {
1015    RegularExpression *regexp = (RegularExpression*)regexp2;
1016    if (validateRE(regexp, FALSE, status)) {
1017        regexp->fMatcher->setTimeLimit(limit, *status);
1018    }
1019}
1020
1021
1022
1023//------------------------------------------------------------------------------
1024//
1025//    uregex_getTimeLimit
1026//
1027//------------------------------------------------------------------------------
1028U_CAPI int32_t U_EXPORT2
1029uregex_getTimeLimit(const  URegularExpression   *regexp2,
1030                           UErrorCode           *status) {
1031    int32_t retVal = 0;
1032    RegularExpression *regexp = (RegularExpression*)regexp2;
1033    if (validateRE(regexp, FALSE, status)) {
1034        retVal = regexp->fMatcher->getTimeLimit();
1035    }
1036    return retVal;
1037}
1038
1039
1040
1041//------------------------------------------------------------------------------
1042//
1043//    uregex_setStackLimit
1044//
1045//------------------------------------------------------------------------------
1046U_CAPI void U_EXPORT2
1047uregex_setStackLimit(URegularExpression   *regexp2,
1048                     int32_t               limit,
1049                     UErrorCode           *status) {
1050    RegularExpression *regexp = (RegularExpression*)regexp2;
1051    if (validateRE(regexp, FALSE, status)) {
1052        regexp->fMatcher->setStackLimit(limit, *status);
1053    }
1054}
1055
1056
1057
1058//------------------------------------------------------------------------------
1059//
1060//    uregex_getStackLimit
1061//
1062//------------------------------------------------------------------------------
1063U_CAPI int32_t U_EXPORT2
1064uregex_getStackLimit(const  URegularExpression   *regexp2,
1065                            UErrorCode           *status) {
1066    int32_t retVal = 0;
1067    RegularExpression *regexp = (RegularExpression*)regexp2;
1068    if (validateRE(regexp, FALSE, status)) {
1069        retVal = regexp->fMatcher->getStackLimit();
1070    }
1071    return retVal;
1072}
1073
1074
1075//------------------------------------------------------------------------------
1076//
1077//    uregex_setMatchCallback
1078//
1079//------------------------------------------------------------------------------
1080U_CAPI void U_EXPORT2
1081uregex_setMatchCallback(URegularExpression      *regexp2,
1082                        URegexMatchCallback     *callback,
1083                        const void              *context,
1084                        UErrorCode              *status) {
1085    RegularExpression *regexp = (RegularExpression*)regexp2;
1086    if (validateRE(regexp, FALSE, status)) {
1087        regexp->fMatcher->setMatchCallback(callback, context, *status);
1088    }
1089}
1090
1091
1092//------------------------------------------------------------------------------
1093//
1094//    uregex_getMatchCallback
1095//
1096//------------------------------------------------------------------------------
1097U_CAPI void U_EXPORT2
1098uregex_getMatchCallback(const URegularExpression    *regexp2,
1099                        URegexMatchCallback        **callback,
1100                        const void                 **context,
1101                        UErrorCode                  *status) {
1102    RegularExpression *regexp = (RegularExpression*)regexp2;
1103     if (validateRE(regexp, FALSE, status)) {
1104         regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1105     }
1106}
1107
1108
1109//------------------------------------------------------------------------------
1110//
//    uregex_setFindProgressCallback
1112//
1113//------------------------------------------------------------------------------
1114U_CAPI void U_EXPORT2
1115uregex_setFindProgressCallback(URegularExpression              *regexp2,
1116                                URegexFindProgressCallback      *callback,
1117                                const void                      *context,
1118                                UErrorCode                      *status) {
1119    RegularExpression *regexp = (RegularExpression*)regexp2;
1120    if (validateRE(regexp, FALSE, status)) {
1121        regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1122    }
1123}
1124
1125
1126//------------------------------------------------------------------------------
1127//
//    uregex_getFindProgressCallback
1129//
1130//------------------------------------------------------------------------------
1131U_CAPI void U_EXPORT2
1132uregex_getFindProgressCallback(const URegularExpression          *regexp2,
1133                                URegexFindProgressCallback        **callback,
1134                                const void                        **context,
1135                                UErrorCode                        *status) {
1136    RegularExpression *regexp = (RegularExpression*)regexp2;
1137     if (validateRE(regexp, FALSE, status)) {
1138         regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1139     }
1140}
1141
1142
1143//------------------------------------------------------------------------------
1144//
1145//    uregex_replaceAll
1146//
1147//------------------------------------------------------------------------------
1148U_CAPI int32_t U_EXPORT2
1149uregex_replaceAll(URegularExpression    *regexp2,
1150                  const UChar           *replacementText,
1151                  int32_t                replacementLength,
1152                  UChar                 *destBuf,
1153                  int32_t                destCapacity,
1154                  UErrorCode            *status)  {
1155    RegularExpression *regexp = (RegularExpression*)regexp2;
1156    if (validateRE(regexp, TRUE, status) == FALSE) {
1157        return 0;
1158    }
1159    if (replacementText == NULL || replacementLength < -1 ||
1160        (destBuf == NULL && destCapacity > 0) ||
1161        destCapacity < 0) {
1162        *status = U_ILLEGAL_ARGUMENT_ERROR;
1163        return 0;
1164    }
1165
1166    int32_t   len = 0;
1167
1168    uregex_reset(regexp2, 0, status);
1169
1170    // Note: Seperate error code variables for findNext() and appendReplacement()
1171    //       are used so that destination buffer overflow errors
1172    //       in appendReplacement won't stop findNext() from working.
1173    //       appendReplacement() and appendTail() special case incoming buffer
1174    //       overflow errors, continuing to return the correct length.
1175    UErrorCode  findStatus = *status;
1176    while (uregex_findNext(regexp2, &findStatus)) {
1177        len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1178                                        &destBuf, &destCapacity, status);
1179    }
1180    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1181
1182    if (U_FAILURE(findStatus)) {
1183        // If anything went wrong with the findNext(), make that error trump
1184        //   whatever may have happened with the append() operations.
1185        //   Errors in findNext() are not expected.
1186        *status = findStatus;
1187    }
1188
1189    return len;
1190}
1191
1192
1193//------------------------------------------------------------------------------
1194//
1195//    uregex_replaceAllUText
1196//
1197//------------------------------------------------------------------------------
1198U_CAPI UText * U_EXPORT2
1199uregex_replaceAllUText(URegularExpression    *regexp2,
1200                       UText                 *replacementText,
1201                       UText                 *dest,
1202                       UErrorCode            *status)  {
1203    RegularExpression *regexp = (RegularExpression*)regexp2;
1204    if (validateRE(regexp, TRUE, status) == FALSE) {
1205        return 0;
1206    }
1207    if (replacementText == NULL) {
1208        *status = U_ILLEGAL_ARGUMENT_ERROR;
1209        return 0;
1210    }
1211
1212    dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1213    return dest;
1214}
1215
1216
1217//------------------------------------------------------------------------------
1218//
1219//    uregex_replaceFirst
1220//
1221//------------------------------------------------------------------------------
1222U_CAPI int32_t U_EXPORT2
1223uregex_replaceFirst(URegularExpression  *regexp2,
1224                    const UChar         *replacementText,
1225                    int32_t              replacementLength,
1226                    UChar               *destBuf,
1227                    int32_t              destCapacity,
1228                    UErrorCode          *status)  {
1229    RegularExpression *regexp = (RegularExpression*)regexp2;
1230    if (validateRE(regexp, TRUE, status) == FALSE) {
1231        return 0;
1232    }
1233    if (replacementText == NULL || replacementLength < -1 ||
1234        (destBuf == NULL && destCapacity > 0) ||
1235        destCapacity < 0) {
1236        *status = U_ILLEGAL_ARGUMENT_ERROR;
1237        return 0;
1238    }
1239
1240    int32_t   len = 0;
1241    UBool     findSucceeded;
1242    uregex_reset(regexp2, 0, status);
1243    findSucceeded = uregex_find(regexp2, 0, status);
1244    if (findSucceeded) {
1245        len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
1246                                       &destBuf, &destCapacity, status);
1247    }
1248    len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1249
1250    return len;
1251}
1252
1253
1254//------------------------------------------------------------------------------
1255//
1256//    uregex_replaceFirstUText
1257//
1258//------------------------------------------------------------------------------
1259U_CAPI UText * U_EXPORT2
1260uregex_replaceFirstUText(URegularExpression  *regexp2,
1261                         UText                 *replacementText,
1262                         UText                 *dest,
1263                         UErrorCode            *status)  {
1264    RegularExpression *regexp = (RegularExpression*)regexp2;
1265    if (validateRE(regexp, TRUE, status) == FALSE) {
1266        return 0;
1267    }
1268    if (replacementText == NULL) {
1269        *status = U_ILLEGAL_ARGUMENT_ERROR;
1270        return 0;
1271    }
1272
1273    dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1274    return dest;
1275}
1276
1277
1278//------------------------------------------------------------------------------
1279//
1280//    uregex_appendReplacement
1281//
1282//------------------------------------------------------------------------------
1283
1284U_NAMESPACE_BEGIN
1285//
1286//  Dummy class, because these functions need to be friends of class RegexMatcher,
1287//               and stand-alone C functions don't work as friends
1288//
1289class RegexCImpl {
1290 public:
1291   inline static  int32_t appendReplacement(RegularExpression    *regexp,
1292                      const UChar           *replacementText,
1293                      int32_t                replacementLength,
1294                      UChar                **destBuf,
1295                      int32_t               *destCapacity,
1296                      UErrorCode            *status);
1297
1298   inline static int32_t appendTail(RegularExpression    *regexp,
1299        UChar                **destBuf,
1300        int32_t               *destCapacity,
1301        UErrorCode            *status);
1302
1303    inline static int32_t split(RegularExpression    *regexp,
1304        UChar                 *destBuf,
1305        int32_t                destCapacity,
1306        int32_t               *requiredCapacity,
1307        UChar                 *destFields[],
1308        int32_t                destFieldsCapacity,
1309        UErrorCode            *status);
1310};
1311
1312U_NAMESPACE_END
1313
1314
1315
// Characters with special meaning inside replacement text, given as code unit values.
static const UChar BACKSLASH    = 0x5C;   // '\'  introduces an escape
static const UChar DOLLARSIGN   = 0x24;   // '$'  introduces a capture group reference
static const UChar LEFTBRACKET  = 0x7B;   // '{'  opens   ${name}  (a curly brace, despite the name)
static const UChar RIGHTBRACKET = 0x7D;   // '}'  closes  ${name}
1320
1321//
1322//  Move a character to an output buffer, with bounds checking on the index.
1323//      Index advances even if capacity is exceeded, for preflight size computations.
1324//      This little sequence is used a LOT.
1325//
static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
    // Claim the slot first: the index moves forward whether or not the
    // character fits, so the caller ends up with the full required length.
    const int32_t slot = (*idx)++;
    if (slot < bufCapacity) {
        buf[slot] = c;
    }
}
1332
1333
1334//
1335//  appendReplacement, the actual implementation.
1336//
1337int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
1338                                      const UChar           *replacementText,
1339                                      int32_t                replacementLength,
1340                                      UChar                **destBuf,
1341                                      int32_t               *destCapacity,
1342                                      UErrorCode            *status)  {
1343
1344    // If we come in with a buffer overflow error, don't suppress the operation.
1345    //  A series of appendReplacements, appendTail need to correctly preflight
1346    //  the buffer size when an overflow happens somewhere in the middle.
1347    UBool pendingBufferOverflow = FALSE;
1348    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1349        pendingBufferOverflow = TRUE;
1350        *status = U_ZERO_ERROR;
1351    }
1352
1353    //
1354    // Validate all paramters
1355    //
1356    if (validateRE(regexp, TRUE, status) == FALSE) {
1357        return 0;
1358    }
1359    if (replacementText == NULL || replacementLength < -1 ||
1360        destCapacity == NULL || destBuf == NULL ||
1361        (*destBuf == NULL && *destCapacity > 0) ||
1362        *destCapacity < 0) {
1363        *status = U_ILLEGAL_ARGUMENT_ERROR;
1364        return 0;
1365    }
1366
1367    RegexMatcher *m = regexp->fMatcher;
1368    if (m->fMatch == FALSE) {
1369        *status = U_REGEX_INVALID_STATE;
1370        return 0;
1371    }
1372
1373    UChar    *dest             = *destBuf;
1374    int32_t   capacity         = *destCapacity;
1375    int32_t   destIdx          =  0;
1376    int32_t   i;
1377
1378    // If it wasn't supplied by the caller,  get the length of the replacement text.
1379    //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
1380    //          the fly and avoid this step.
1381    if (replacementLength == -1) {
1382        replacementLength = u_strlen(replacementText);
1383    }
1384
1385    // Copy input string from the end of previous match to start of current match
1386    if (regexp->fText != NULL) {
1387        int32_t matchStart;
1388        int32_t lastMatchEnd;
1389        if (UTEXT_USES_U16(m->fInputText)) {
1390            lastMatchEnd = (int32_t)m->fLastMatchEnd;
1391            matchStart = (int32_t)m->fMatchStart;
1392        } else {
1393            // !!!: Would like a better way to do this!
1394            UErrorCode tempStatus = U_ZERO_ERROR;
1395            lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
1396            tempStatus = U_ZERO_ERROR;
1397            matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
1398        }
1399        for (i=lastMatchEnd; i<matchStart; i++) {
1400            appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
1401        }
1402    } else {
1403        UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1404        destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1405                                 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
1406                                 &possibleOverflowError);
1407    }
1408    U_ASSERT(destIdx >= 0);
1409
1410    // scan the replacement text, looking for substitutions ($n) and \escapes.
1411    int32_t  replIdx = 0;
1412    while (replIdx < replacementLength && U_SUCCESS(*status)) {
1413        UChar  c = replacementText[replIdx];
1414        replIdx++;
1415        if (c != DOLLARSIGN && c != BACKSLASH) {
1416            // Common case, no substitution, no escaping,
1417            //  just copy the char to the dest buf.
1418            appendToBuf(c, &destIdx, dest, capacity);
1419            continue;
1420        }
1421
1422        if (c == BACKSLASH) {
1423            // Backslash Escape.  Copy the following char out without further checks.
1424            //                    Note:  Surrogate pairs don't need any special handling
1425            //                           The second half wont be a '$' or a '\', and
1426            //                           will move to the dest normally on the next
1427            //                           loop iteration.
1428            if (replIdx >= replacementLength) {
1429                break;
1430            }
1431            c = replacementText[replIdx];
1432
1433            if (c==0x55/*U*/ || c==0x75/*u*/) {
1434                // We have a \udddd or \Udddddddd escape sequence.
1435                UChar32 escapedChar =
1436                    u_unescapeAt(uregex_ucstr_unescape_charAt,
1437                       &replIdx,                   // Index is updated by unescapeAt
1438                       replacementLength,          // Length of replacement text
1439                       (void *)replacementText);
1440
1441                if (escapedChar != (UChar32)0xFFFFFFFF) {
1442                    if (escapedChar <= 0xffff) {
1443                        appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1444                    } else {
1445                        appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1446                        appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1447                    }
1448                    continue;
1449                }
1450                // Note:  if the \u escape was invalid, just fall through and
1451                //        treat it as a plain \<anything> escape.
1452            }
1453
1454            // Plain backslash escape.  Just put out the escaped character.
1455            appendToBuf(c, &destIdx, dest, capacity);
1456
1457            replIdx++;
1458            continue;
1459        }
1460
1461        // We've got a $.  Pick up the following capture group name or number.
1462        // For numbers, consume only digits that produce a valid capture group for the pattern.
1463
1464        int32_t groupNum  = 0;
1465        U_ASSERT(c == DOLLARSIGN);
1466        UChar32 c32;
1467        U16_GET(replacementText, 0, replIdx, replacementLength, c32);
1468        if (u_isdigit(c32)) {
1469            int32_t numDigits = 0;
1470            int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
1471            for (;;) {
1472                if (replIdx >= replacementLength) {
1473                    break;
1474                }
1475                U16_GET(replacementText, 0, replIdx, replacementLength, c32);
1476                if (u_isdigit(c32) == FALSE) {
1477                    break;
1478                }
1479
1480                int32_t digitVal = u_charDigitValue(c32);
1481                if (groupNum * 10 + digitVal <= numCaptureGroups) {
1482                    groupNum = groupNum * 10 + digitVal;
1483                    U16_FWD_1(replacementText, replIdx, replacementLength);
1484                    numDigits++;
1485                } else {
1486                    if (numDigits == 0) {
1487                        *status = U_INDEX_OUTOFBOUNDS_ERROR;
1488                    }
1489                    break;
1490                }
1491            }
1492        } else if (c32 == LEFTBRACKET) {
1493            // Scan for Named Capture Group, ${name}.
1494            UnicodeString groupName;
1495            U16_FWD_1(replacementText, replIdx, replacementLength);
1496            while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
1497                if (replIdx >= replacementLength) {
1498                    *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1499                    break;
1500                }
1501                U16_NEXT(replacementText, replIdx, replacementLength, c32);
1502                if ((c32 >= 0x41 && c32 <= 0x5a) ||           // A..Z
1503                        (c32 >= 0x61 && c32 <= 0x7a) ||       // a..z
1504                        (c32 >= 0x31 && c32 <= 0x39)) {       // 0..9
1505                    groupName.append(c32);
1506                } else if (c32 == RIGHTBRACKET) {
1507                    groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
1508                    if (groupNum == 0) {
1509                        // Name not defined by pattern.
1510                        *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1511                    }
1512                } else {
1513                    // Character was something other than a name char or a closing '}'
1514                    *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1515                }
1516            }
1517        } else {
1518            // $ not followed by {name} or digits.
1519            *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1520        }
1521
1522
1523        // Finally, append the capture group data to the destination.
1524        if (U_SUCCESS(*status)) {
1525            destIdx += uregex_group((URegularExpression*)regexp, groupNum,
1526                                    dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1527            if (*status == U_BUFFER_OVERFLOW_ERROR) {
1528                // Ignore buffer overflow when extracting the group.  We need to
1529                //   continue on to get full size of the untruncated result.  We will
1530                //   raise our own buffer overflow error at the end.
1531                *status = U_ZERO_ERROR;
1532            }
1533        }
1534
1535        if (U_FAILURE(*status)) {
1536            // bad group number or name.
1537            break;
1538        }
1539    }
1540
1541    //
1542    //  Nul Terminate the dest buffer if possible.
1543    //  Set the appropriate buffer overflow or not terminated error, if needed.
1544    //
1545    if (destIdx < capacity) {
1546        dest[destIdx] = 0;
1547    } else if (U_SUCCESS(*status)) {
1548        if (destIdx == *destCapacity) {
1549            *status = U_STRING_NOT_TERMINATED_WARNING;
1550        } else {
1551            *status = U_BUFFER_OVERFLOW_ERROR;
1552        }
1553    }
1554
1555    //
1556    // Return an updated dest buffer and capacity to the caller.
1557    //
1558    if (destIdx > 0 &&  *destCapacity > 0) {
1559        if (destIdx < capacity) {
1560            *destBuf      += destIdx;
1561            *destCapacity -= destIdx;
1562        } else {
1563            *destBuf      += capacity;
1564            *destCapacity =  0;
1565        }
1566    }
1567
1568    // If we came in with a buffer overflow, make sure we go out with one also.
1569    //   (A zero length match right at the end of the previous match could
1570    //    make this function succeed even though a previous call had overflowed the buf)
1571    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1572        *status = U_BUFFER_OVERFLOW_ERROR;
1573    }
1574
1575    return destIdx;
1576}
1577
1578//
1579//   appendReplacement   the actual API function,
1580//
1581U_CAPI int32_t U_EXPORT2
1582uregex_appendReplacement(URegularExpression    *regexp2,
1583                         const UChar           *replacementText,
1584                         int32_t                replacementLength,
1585                         UChar                **destBuf,
1586                         int32_t               *destCapacity,
1587                         UErrorCode            *status) {
1588
1589    RegularExpression *regexp = (RegularExpression*)regexp2;
1590    return RegexCImpl::appendReplacement(
1591        regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1592}
1593
1594//
1595//   uregex_appendReplacementUText...can just use the normal C++ method
1596//
1597U_CAPI void U_EXPORT2
1598uregex_appendReplacementUText(URegularExpression    *regexp2,
1599                              UText                 *replText,
1600                              UText                 *dest,
1601                              UErrorCode            *status)  {
1602    RegularExpression *regexp = (RegularExpression*)regexp2;
1603    regexp->fMatcher->appendReplacement(dest, replText, *status);
1604}
1605
1606
1607//------------------------------------------------------------------------------
1608//
1609//    uregex_appendTail
1610//
1611//------------------------------------------------------------------------------
1612int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
1613                               UChar                **destBuf,
1614                               int32_t               *destCapacity,
1615                               UErrorCode            *status)
1616{
1617
1618    // If we come in with a buffer overflow error, don't suppress the operation.
1619    //  A series of appendReplacements, appendTail need to correctly preflight
1620    //  the buffer size when an overflow happens somewhere in the middle.
1621    UBool pendingBufferOverflow = FALSE;
1622    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1623        pendingBufferOverflow = TRUE;
1624        *status = U_ZERO_ERROR;
1625    }
1626
1627    if (validateRE(regexp, TRUE, status) == FALSE) {
1628        return 0;
1629    }
1630
1631    if (destCapacity == NULL || destBuf == NULL ||
1632        (*destBuf == NULL && *destCapacity > 0) ||
1633        *destCapacity < 0)
1634    {
1635        *status = U_ILLEGAL_ARGUMENT_ERROR;
1636        return 0;
1637    }
1638
1639    RegexMatcher *m = regexp->fMatcher;
1640
1641    int32_t  destIdx     = 0;
1642    int32_t  destCap     = *destCapacity;
1643    UChar    *dest       = *destBuf;
1644
1645    if (regexp->fText != NULL) {
1646        int32_t srcIdx;
1647        int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1648        if (nativeIdx == -1) {
1649            srcIdx = 0;
1650        } else if (UTEXT_USES_U16(m->fInputText)) {
1651            srcIdx = (int32_t)nativeIdx;
1652        } else {
1653            UErrorCode status = U_ZERO_ERROR;
1654            srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1655        }
1656
1657        for (;;) {
1658            U_ASSERT(destIdx >= 0);
1659
1660            if (srcIdx == regexp->fTextLength) {
1661                break;
1662            }
1663            UChar c = regexp->fText[srcIdx];
1664            if (c == 0 && regexp->fTextLength == -1) {
1665                regexp->fTextLength = srcIdx;
1666                break;
1667            }
1668
1669            if (destIdx < destCap) {
1670                dest[destIdx] = c;
1671            } else {
1672                // We've overflowed the dest buffer.
1673                //  If the total input string length is known, we can
1674                //    compute the total buffer size needed without scanning through the string.
1675                if (regexp->fTextLength > 0) {
1676                    destIdx += (regexp->fTextLength - srcIdx);
1677                    break;
1678                }
1679            }
1680            srcIdx++;
1681            destIdx++;
1682        }
1683    } else {
1684        int64_t  srcIdx;
1685        if (m->fMatch) {
1686            // The most recent call to find() succeeded.
1687            srcIdx = m->fMatchEnd;
1688        } else {
1689            // The last call to find() on this matcher failed().
1690            //   Look back to the end of the last find() that succeeded for src index.
1691            srcIdx = m->fLastMatchEnd;
1692            if (srcIdx == -1)  {
1693                // There has been no successful match with this matcher.
1694                //   We want to copy the whole string.
1695                srcIdx = 0;
1696            }
1697        }
1698
1699        destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1700    }
1701
1702    //
1703    //  NUL terminate the output string, if possible, otherwise issue the
1704    //   appropriate error or warning.
1705    //
1706    if (destIdx < destCap) {
1707        dest[destIdx] = 0;
1708    } else  if (destIdx == destCap) {
1709        *status = U_STRING_NOT_TERMINATED_WARNING;
1710    } else {
1711        *status = U_BUFFER_OVERFLOW_ERROR;
1712    }
1713
1714    //
1715    // Update the user's buffer ptr and capacity vars to reflect the
1716    //   amount used.
1717    //
1718    if (destIdx < destCap) {
1719        *destBuf      += destIdx;
1720        *destCapacity -= destIdx;
1721    } else if (*destBuf != NULL) {
1722        *destBuf      += destCap;
1723        *destCapacity  = 0;
1724    }
1725
1726    if (pendingBufferOverflow && U_SUCCESS(*status)) {
1727        *status = U_BUFFER_OVERFLOW_ERROR;
1728    }
1729
1730    return destIdx;
1731}
1732
1733
1734//
1735//   appendTail   the actual API function
1736//
1737U_CAPI int32_t U_EXPORT2
1738uregex_appendTail(URegularExpression    *regexp2,
1739                  UChar                **destBuf,
1740                  int32_t               *destCapacity,
1741                  UErrorCode            *status)  {
1742    RegularExpression *regexp = (RegularExpression*)regexp2;
1743    return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1744}
1745
1746
1747//
1748//   uregex_appendTailUText...can just use the normal C++ method
1749//
1750U_CAPI UText * U_EXPORT2
1751uregex_appendTailUText(URegularExpression    *regexp2,
1752                       UText                 *dest,
1753                       UErrorCode            *status)  {
1754    RegularExpression *regexp = (RegularExpression*)regexp2;
1755    return regexp->fMatcher->appendTail(dest, *status);
1756}
1757
1758
1759//------------------------------------------------------------------------------
1760//
1761//    copyString     Internal utility to copy a string to an output buffer,
1762//                   while managing buffer overflow and preflight size
1763//                   computation.  NUL termination is added to destination,
1764//                   and the NUL is counted in the output size.
1765//
1766//------------------------------------------------------------------------------
1767#if 0
1768static void copyString(UChar        *destBuffer,    //  Destination buffer.
1769                       int32_t       destCapacity,  //  Total capacity of dest buffer
1770                       int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
1771                                                    //    Update not clipped to destCapacity.
1772                       const UChar  *srcPtr,        //  Pointer to source string
1773                       int32_t       srcLen)        //  Source string len.
1774{
1775    int32_t  si;
1776    int32_t  di = *destIndex;
1777    UChar    c;
1778
1779    for (si=0; si<srcLen;  si++) {
1780        c = srcPtr[si];
1781        if (di < destCapacity) {
1782            destBuffer[di] = c;
1783            di++;
1784        } else {
1785            di += srcLen - si;
1786            break;
1787        }
1788    }
1789    if (di<destCapacity) {
1790        destBuffer[di] = 0;
1791    }
1792    di++;
1793    *destIndex = di;
1794}
1795#endif
1796
1797//------------------------------------------------------------------------------
1798//
1799//    uregex_split
1800//
1801//------------------------------------------------------------------------------
1802int32_t RegexCImpl::split(RegularExpression     *regexp,
1803                          UChar                 *destBuf,
1804                          int32_t                destCapacity,
1805                          int32_t               *requiredCapacity,
1806                          UChar                 *destFields[],
1807                          int32_t                destFieldsCapacity,
1808                          UErrorCode            *status) {
1809    //
1810    // Reset for the input text
1811    //
1812    regexp->fMatcher->reset();
1813    UText *inputText = regexp->fMatcher->fInputText;
1814    int64_t   nextOutputStringStart = 0;
1815    int64_t   inputLen = regexp->fMatcher->fInputLength;
1816    if (inputLen == 0) {
1817        return 0;
1818    }
1819
1820    //
1821    // Loop through the input text, searching for the delimiter pattern
1822    //
1823    int32_t   i;             // Index of the field being processed.
1824    int32_t   destIdx = 0;   // Next available position in destBuf;
1825    int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
1826    UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
1827    for (i=0; ; i++) {
1828        if (i>=destFieldsCapacity-1) {
1829            // There are one or zero output strings left.
1830            // Fill the last output string with whatever is left from the input, then exit the loop.
1831            //  ( i will be == destFieldsCapacity if we filled the output array while processing
1832            //    capture groups of the delimiter expression, in which case we will discard the
1833            //    last capture group saved in favor of the unprocessed remainder of the
1834            //    input string.)
1835            if (inputLen > nextOutputStringStart) {
1836                if (i != destFieldsCapacity-1) {
1837                    // No fields are left.  Recycle the last one for holding the trailing part of
1838                    //   the input string.
1839                    i = destFieldsCapacity-1;
1840                    destIdx = (int32_t)(destFields[i] - destFields[0]);
1841                }
1842
1843                destFields[i] = &destBuf[destIdx];
1844                destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1845                                             &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1846            }
1847            break;
1848        }
1849
1850        if (regexp->fMatcher->find()) {
1851            // We found another delimiter.  Move everything from where we started looking
1852            //  up until the start of the delimiter into the next output string.
1853            destFields[i] = &destBuf[destIdx];
1854
1855            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1856                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1857            if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1858                tStatus = U_ZERO_ERROR;
1859            } else {
1860                *status = tStatus;
1861            }
1862            nextOutputStringStart = regexp->fMatcher->fMatchEnd;
1863
1864            // If the delimiter pattern has capturing parentheses, the captured
1865            //  text goes out into the next n destination strings.
1866            int32_t groupNum;
1867            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1868                // If we've run out of output string slots, bail out.
1869                if (i==destFieldsCapacity-1) {
1870                    break;
1871                }
1872                i++;
1873
1874                // Set up to extract the capture group contents into the dest buffer.
1875                destFields[i] = &destBuf[destIdx];
1876                tStatus = U_ZERO_ERROR;
1877                int32_t t = uregex_group((URegularExpression*)regexp,
1878                                         groupNum,
1879                                         destFields[i],
1880                                         REMAINING_CAPACITY(destIdx, destCapacity),
1881                                         &tStatus);
1882                destIdx += t + 1;    // Record the space used in the output string buffer.
1883                                     //  +1 for the NUL that terminates the string.
1884                if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1885                    tStatus = U_ZERO_ERROR;
1886                } else {
1887                    *status = tStatus;
1888                }
1889            }
1890
1891            if (nextOutputStringStart == inputLen) {
1892                // The delimiter was at the end of the string.
1893                // Output an empty string, and then we are done.
1894                if (destIdx < destCapacity) {
1895                    destBuf[destIdx] = 0;
1896                }
1897                if (i < destFieldsCapacity-1) {
1898                   ++i;
1899                }
1900                if (destIdx < destCapacity) {
1901                    destFields[i] = destBuf + destIdx;
1902                }
1903                ++destIdx;
1904                break;
1905            }
1906
1907        }
1908        else
1909        {
1910            // We ran off the end of the input while looking for the next delimiter.
1911            // All the remaining text goes into the current output string.
1912            destFields[i] = &destBuf[destIdx];
1913            destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1914                                         &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1915            break;
1916        }
1917    }
1918
1919    // Zero out any unused portion of the destFields array
1920    int j;
1921    for (j=i+1; j<destFieldsCapacity; j++) {
1922        destFields[j] = NULL;
1923    }
1924
1925    if (requiredCapacity != NULL) {
1926        *requiredCapacity = destIdx;
1927    }
1928    if (destIdx > destCapacity) {
1929        *status = U_BUFFER_OVERFLOW_ERROR;
1930    }
1931    return i+1;
1932}
1933
1934//
1935//   uregex_split   The actual API function
1936//
1937U_CAPI int32_t U_EXPORT2
1938uregex_split(URegularExpression      *regexp2,
1939             UChar                   *destBuf,
1940             int32_t                  destCapacity,
1941             int32_t                 *requiredCapacity,
1942             UChar                   *destFields[],
1943             int32_t                  destFieldsCapacity,
1944             UErrorCode              *status) {
1945    RegularExpression *regexp = (RegularExpression*)regexp2;
1946    if (validateRE(regexp, TRUE, status) == FALSE) {
1947        return 0;
1948    }
1949    if ((destBuf == NULL && destCapacity > 0) ||
1950        destCapacity < 0 ||
1951        destFields == NULL ||
1952        destFieldsCapacity < 1 ) {
1953        *status = U_ILLEGAL_ARGUMENT_ERROR;
1954        return 0;
1955    }
1956
1957    return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1958}
1959
1960
1961//
1962//   uregex_splitUText...can just use the normal C++ method
1963//
1964U_CAPI int32_t U_EXPORT2
1965uregex_splitUText(URegularExpression    *regexp2,
1966                  UText                 *destFields[],
1967                  int32_t                destFieldsCapacity,
1968                  UErrorCode            *status) {
1969    RegularExpression *regexp = (RegularExpression*)regexp2;
1970    return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1971}
1972
1973
1974#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1975
1976