1/*
2*******************************************************************************
3*
4*   Copyright (C) 2005-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  utext.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2005apr12
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/ustring.h"
19#include "unicode/unistr.h"
20#include "unicode/chariter.h"
21#include "unicode/utext.h"
22#include "ustr_imp.h"
23#include "cmemory.h"
24#include "cstring.h"
25#include "uassert.h"
26#include "putilimp.h"
27
28U_NAMESPACE_USE
29
// Build an int32_t mask with only bit number bitIndex set.
// The shift is done on an unsigned 1 so that bitIndex==31 does not shift
// a signed value into the sign bit, which is undefined behavior.
#define I32_FLAG(bitIndex) ((int32_t)((uint32_t)1<<(bitIndex)))
31
32
33static UBool
34utext_access(UText *ut, int64_t index, UBool forward) {
35    return ut->pFuncs->access(ut, index, forward);
36}
37
38
39
40U_CAPI UBool U_EXPORT2
41utext_moveIndex32(UText *ut, int32_t delta) {
42    UChar32  c;
43    if (delta > 0) {
44        do {
45            if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
46                return FALSE;
47            }
48            c = ut->chunkContents[ut->chunkOffset];
49            if (U16_IS_SURROGATE(c)) {
50                c = utext_next32(ut);
51                if (c == U_SENTINEL) {
52                    return FALSE;
53                }
54            } else {
55                ut->chunkOffset++;
56            }
57        } while(--delta>0);
58
59    } else if (delta<0) {
60        do {
61            if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
62                return FALSE;
63            }
64            c = ut->chunkContents[ut->chunkOffset-1];
65            if (U16_IS_SURROGATE(c)) {
66                c = utext_previous32(ut);
67                if (c == U_SENTINEL) {
68                    return FALSE;
69                }
70            } else {
71                ut->chunkOffset--;
72            }
73        } while(++delta<0);
74    }
75
76    return TRUE;
77}
78
79
80U_CAPI int64_t U_EXPORT2
81utext_nativeLength(UText *ut) {
82    return ut->pFuncs->nativeLength(ut);
83}
84
85
86U_CAPI UBool U_EXPORT2
87utext_isLengthExpensive(const UText *ut) {
88    UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
89    return r;
90}
91
92
93U_CAPI int64_t U_EXPORT2
94utext_getNativeIndex(const UText *ut) {
95    if(ut->chunkOffset <= ut->nativeIndexingLimit) {
96        return ut->chunkNativeStart+ut->chunkOffset;
97    } else {
98        return ut->pFuncs->mapOffsetToNative(ut);
99    }
100}
101
102
103U_CAPI void U_EXPORT2
104utext_setNativeIndex(UText *ut, int64_t index) {
105    if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
106        // The desired position is outside of the current chunk.
107        // Access the new position.  Assume a forward iteration from here,
108        // which will also be optimimum for a single random access.
109        // Reverse iterations may suffer slightly.
110        ut->pFuncs->access(ut, index, TRUE);
111    } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
112        // utf-16 indexing.
113        ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
114    } else {
115         ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
116    }
117    // The convention is that the index must always be on a code point boundary.
118    // Adjust the index position if it is in the middle of a surrogate pair.
119    if (ut->chunkOffset<ut->chunkLength) {
120        UChar c= ut->chunkContents[ut->chunkOffset];
121        if (UTF16_IS_TRAIL(c)) {
122            if (ut->chunkOffset==0) {
123                ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);
124            }
125            if (ut->chunkOffset>0) {
126                UChar lead = ut->chunkContents[ut->chunkOffset-1];
127                if (UTF16_IS_LEAD(lead)) {
128                    ut->chunkOffset--;
129                }
130            }
131        }
132    }
133}
134
135
136
137U_CAPI int64_t U_EXPORT2
138utext_getPreviousNativeIndex(UText *ut) {
139    //
140    //  Fast-path the common case.
141    //     Common means current position is not at the beginning of a chunk
142    //     and the preceding character is not supplementary.
143    //
144    int32_t i = ut->chunkOffset - 1;
145    int64_t result;
146    if (i >= 0) {
147        UChar c = ut->chunkContents[i];
148        if (U16_IS_TRAIL(c) == FALSE) {
149            if (i <= ut->nativeIndexingLimit) {
150                result = ut->chunkNativeStart + i;
151            } else {
152                ut->chunkOffset = i;
153                result = ut->pFuncs->mapOffsetToNative(ut);
154                ut->chunkOffset++;
155            }
156            return result;
157        }
158    }
159
160    // If at the start of text, simply return 0.
161    if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
162        return 0;
163    }
164
165    // Harder, less common cases.  We are at a chunk boundary, or on a surrogate.
166    //    Keep it simple, use other functions to handle the edges.
167    //
168    utext_previous32(ut);
169    result = UTEXT_GETNATIVEINDEX(ut);
170    utext_next32(ut);
171    return result;
172}
173
174
175//
176//  utext_current32.  Get the UChar32 at the current position.
177//                    UText iteration position is always on a code point boundary,
178//                    never on the trail half of a surrogate pair.
179//
180U_CAPI UChar32 U_EXPORT2
181utext_current32(UText *ut) {
182    UChar32  c;
183    if (ut->chunkOffset==ut->chunkLength) {
184        // Current position is just off the end of the chunk.
185        if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
186            // Off the end of the text.
187            return U_SENTINEL;
188        }
189    }
190
191    c = ut->chunkContents[ut->chunkOffset];
192    if (U16_IS_LEAD(c) == FALSE) {
193        // Normal, non-supplementary case.
194        return c;
195    }
196
197    //
198    //  Possible supplementary char.
199    //
200    UChar32   trail = 0;
201    UChar32   supplementaryC = c;
202    if ((ut->chunkOffset+1) < ut->chunkLength) {
203        // The trail surrogate is in the same chunk.
204        trail = ut->chunkContents[ut->chunkOffset+1];
205    } else {
206        //  The trail surrogate is in a different chunk.
207        //     Because we must maintain the iteration position, we need to switch forward
208        //     into the new chunk, get the trail surrogate, then revert the chunk back to the
209        //     original one.
210        //     An edge case to be careful of:  the entire text may end with an unpaired
211        //        leading surrogate.  The attempt to access the trail will fail, but
212        //        the original position before the unpaired lead still needs to be restored.
213        int64_t  nativePosition = ut->chunkNativeLimit;
214        int32_t  originalOffset = ut->chunkOffset;
215        if (ut->pFuncs->access(ut, nativePosition, TRUE)) {
216            trail = ut->chunkContents[ut->chunkOffset];
217        }
218        UBool r = ut->pFuncs->access(ut, nativePosition, FALSE);  // reverse iteration flag loads preceding chunk
219        U_ASSERT(r==TRUE);
220        ut->chunkOffset = originalOffset;
221        if(!r) {
222            return U_SENTINEL;
223        }
224    }
225
226    if (U16_IS_TRAIL(trail)) {
227        supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
228    }
229    return supplementaryC;
230
231}
232
233
234U_CAPI UChar32 U_EXPORT2
235utext_char32At(UText *ut, int64_t nativeIndex) {
236    UChar32 c = U_SENTINEL;
237
238    // Fast path the common case.
239    if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
240        ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
241        c = ut->chunkContents[ut->chunkOffset];
242        if (U16_IS_SURROGATE(c) == FALSE) {
243            return c;
244        }
245    }
246
247
248    utext_setNativeIndex(ut, nativeIndex);
249    if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
250        c = ut->chunkContents[ut->chunkOffset];
251        if (U16_IS_SURROGATE(c)) {
252            // For surrogates, let current32() deal with the complications
253            //    of supplementaries that may span chunk boundaries.
254            c = utext_current32(ut);
255        }
256    }
257    return c;
258}
259
260
261U_CAPI UChar32 U_EXPORT2
262utext_next32(UText *ut) {
263    UChar32       c;
264
265    if (ut->chunkOffset >= ut->chunkLength) {
266        if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
267            return U_SENTINEL;
268        }
269    }
270
271    c = ut->chunkContents[ut->chunkOffset++];
272    if (U16_IS_LEAD(c) == FALSE) {
273        // Normal case, not supplementary.
274        //   (A trail surrogate seen here is just returned as is, as a surrogate value.
275        //    It cannot be part of a pair.)
276        return c;
277    }
278
279    if (ut->chunkOffset >= ut->chunkLength) {
280        if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
281            // c is an unpaired lead surrogate at the end of the text.
282            // return it as it is.
283            return c;
284        }
285    }
286    UChar32 trail = ut->chunkContents[ut->chunkOffset];
287    if (U16_IS_TRAIL(trail) == FALSE) {
288        // c was an unpaired lead surrogate, not at the end of the text.
289        // return it as it is (unpaired).  Iteration position is on the
290        // following character, possibly in the next chunk, where the
291        //  trail surrogate would have been if it had existed.
292        return c;
293    }
294
295    UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
296    ut->chunkOffset++;   // move iteration position over the trail surrogate.
297    return supplementary;
298    }
299
300
301U_CAPI UChar32 U_EXPORT2
302utext_previous32(UText *ut) {
303    UChar32       c;
304
305    if (ut->chunkOffset <= 0) {
306        if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
307            return U_SENTINEL;
308        }
309    }
310    ut->chunkOffset--;
311    c = ut->chunkContents[ut->chunkOffset];
312    if (U16_IS_TRAIL(c) == FALSE) {
313        // Normal case, not supplementary.
314        //   (A lead surrogate seen here is just returned as is, as a surrogate value.
315        //    It cannot be part of a pair.)
316        return c;
317    }
318
319    if (ut->chunkOffset <= 0) {
320        if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
321            // c is an unpaired trail surrogate at the start of the text.
322            // return it as it is.
323            return c;
324        }
325    }
326
327    UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
328    if (U16_IS_LEAD(lead) == FALSE) {
329        // c was an unpaired trail surrogate, not at the end of the text.
330        // return it as it is (unpaired).  Iteration position is at c
331        return c;
332    }
333
334    UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
335    ut->chunkOffset--;   // move iteration position over the lead surrogate.
336    return supplementary;
337}
338
339
340
341U_CAPI UChar32 U_EXPORT2
342utext_next32From(UText *ut, int64_t index) {
343    UChar32       c      = U_SENTINEL;
344
345    if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
346        // Desired position is outside of the current chunk.
347        if(!ut->pFuncs->access(ut, index, TRUE)) {
348            // no chunk available here
349            return U_SENTINEL;
350        }
351    } else if (index - ut->chunkNativeStart  <= (int64_t)ut->nativeIndexingLimit) {
352        // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
353        ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
354    } else {
355        // Desired position is in chunk, with non-UTF16 indexing.
356        ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
357    }
358
359    c = ut->chunkContents[ut->chunkOffset++];
360    if (U16_IS_SURROGATE(c)) {
361        // Surrogates.  Many edge cases.  Use other functions that already
362        //              deal with the problems.
363        utext_setNativeIndex(ut, index);
364        c = utext_next32(ut);
365    }
366    return c;
367}
368
369
370U_CAPI UChar32 U_EXPORT2
371utext_previous32From(UText *ut, int64_t index) {
372    //
373    //  Return the character preceding the specified index.
374    //  Leave the iteration position at the start of the character that was returned.
375    //
376    UChar32     cPrev;    // The character preceding cCurr, which is what we will return.
377
378    // Address the chunk containg the position preceding the incoming index
379    // A tricky edge case:
380    //   We try to test the requested native index against the chunkNativeStart to determine
381    //    whether the character preceding the one at the index is in the current chunk.
382    //    BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
383    //    requested index is on something other than the first position of the first char.
384    //
385    if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
386        // Requested native index is outside of the current chunk.
387        if(!ut->pFuncs->access(ut, index, FALSE)) {
388            // no chunk available here
389            return U_SENTINEL;
390        }
391    } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
392        // Direct UTF-16 indexing.
393        ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
394    } else {
395        ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
396        if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) {
397            // no chunk available here
398            return U_SENTINEL;
399        }
400    }
401
402    //
403    // Simple case with no surrogates.
404    //
405    ut->chunkOffset--;
406    cPrev = ut->chunkContents[ut->chunkOffset];
407
408    if (U16_IS_SURROGATE(cPrev)) {
409        // Possible supplementary.  Many edge cases.
410        // Let other functions do the heavy lifting.
411        utext_setNativeIndex(ut, index);
412        cPrev = utext_previous32(ut);
413    }
414    return cPrev;
415}
416
417
418U_CAPI int32_t U_EXPORT2
419utext_extract(UText *ut,
420             int64_t start, int64_t limit,
421             UChar *dest, int32_t destCapacity,
422             UErrorCode *status) {
423                 return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
424             }
425
426
427
428U_CAPI UBool U_EXPORT2
429utext_equals(const UText *a, const UText *b) {
430    if (a==NULL || b==NULL ||
431        a->magic != UTEXT_MAGIC ||
432        b->magic != UTEXT_MAGIC) {
433            // Null or invalid arguments don't compare equal to anything.
434            return FALSE;
435    }
436
437    if (a->pFuncs != b->pFuncs) {
438        // Different types of text providers.
439        return FALSE;
440    }
441
442    if (a->context != b->context) {
443        // Different sources (different strings)
444        return FALSE;
445    }
446    if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
447        // Different current position in the string.
448        return FALSE;
449    }
450
451    return TRUE;
452}
453
454U_CAPI int32_t U_EXPORT2
455utext_compare(UText *s1, int32_t length1,
456              UText *s2, int32_t length2) {
457    UChar32 c1 = 0, c2 = 0;
458
459    if(length1<0 && length2<0) {
460        /* strcmp style, go until end of string */
461        for(;;) {
462            c1 = UTEXT_NEXT32(s1);
463            c2 = UTEXT_NEXT32(s2);
464            if(c1 != c2) {
465                break;
466            } else if(c1 == U_SENTINEL) {
467                return 0;
468            }
469        }
470    } else {
471        if(length1 < 0) {
472            length1 = INT32_MIN;
473        } else if (length2 < 0) {
474            length2 = INT32_MIN;
475        }
476
477        /* memcmp/UnicodeString style, both length-specified */
478        while((length1 > 0 || length1 == INT32_MIN) && (length2 > 0 || length2 == INT32_MIN)) {
479            c1 = UTEXT_NEXT32(s1);
480            c2 = UTEXT_NEXT32(s2);
481
482            if(c1 != c2) {
483                break;
484            } else if(c1 == U_SENTINEL) {
485                return 0;
486            }
487
488            if (length1 != INT32_MIN) {
489                length1 -= 1;
490            }
491            if (length2 != INT32_MIN) {
492                length2 -= 1;
493            }
494        }
495
496        if(length1 <= 0 && length1 != INT32_MIN) {
497            if(length2 <= 0) {
498                return 0;
499            } else {
500                return -1;
501            }
502        } else if(length2 <= 0 && length2 != INT32_MIN) {
503            if (length1 <= 0) {
504                return 0;
505            } else {
506                return 1;
507            }
508        }
509    }
510
511    return (int32_t)c1-(int32_t)c2;
512}
513
514U_CAPI int32_t U_EXPORT2
515utext_compareNativeLimit(UText *s1, int64_t limit1,
516                         UText *s2, int64_t limit2) {
517    UChar32 c1, c2;
518
519    if(limit1<0 && limit2<0) {
520        /* strcmp style, go until end of string */
521        for(;;) {
522            c1 = UTEXT_NEXT32(s1);
523            c2 = UTEXT_NEXT32(s2);
524            if(c1 != c2) {
525                return (int32_t)c1-(int32_t)c2;
526            } else if(c1 == U_SENTINEL) {
527                return 0;
528            }
529        }
530    } else {
531        /* memcmp/UnicodeString style, both length-specified */
532        int64_t index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
533        int64_t index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);
534
535        while((limit1 < 0 || index1 < limit1) && (limit2 < 0 || index2 < limit2)) {
536            c1 = UTEXT_NEXT32(s1);
537            c2 = UTEXT_NEXT32(s2);
538
539            if(c1 != c2) {
540                return (int32_t)c1-(int32_t)c2;
541            } else if(c1 == U_SENTINEL) {
542                return 0;
543            }
544
545            if (limit1 >= 0) {
546                index1 = UTEXT_GETNATIVEINDEX(s1);
547            }
548            if (limit2 >= 0) {
549                index2 = UTEXT_GETNATIVEINDEX(s2);
550            }
551        }
552
553        if(limit1 >= 0 && index1 >= limit1) {
554            if(index2 >= limit2) {
555                return 0;
556            } else {
557                return -1;
558            }
559        } else {
560            if(index1 >= limit1) {
561                return 0;
562            } else {
563                return 1;
564            }
565        }
566    }
567}
568
569U_CAPI int32_t U_EXPORT2
570utext_caseCompare(UText *s1, int32_t length1,
571                     UText *s2, int32_t length2,
572                     uint32_t options, UErrorCode *pErrorCode) {
573    const UCaseProps *csp;
574
575    /* case folding variables */
576    const UChar *p;
577    int32_t length;
578
579    /* case folding buffers, only use current-level start/limit */
580    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
581    int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;
582
583    /* current code points */
584    UChar32 c1, c2;
585    uint8_t cLength1, cLength2;
586
587    /* argument checking */
588    if(U_FAILURE(*pErrorCode)) {
589        return 0;
590    }
591    if(s1==NULL || s2==NULL) {
592        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
593        return 0;
594    }
595
596    csp=ucase_getSingleton();
597
598    /* for variable-length strings */
599    if(length1 < 0) {
600        length1 = INT32_MIN;
601    }
602    if (length2 < 0) {
603        length2 = INT32_MIN;
604    }
605
606    /* initialize */
607    foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;
608
609    /* comparison loop */
610    while((foldOffset1 < foldLength1 || length1 > 0 || length1 == INT32_MIN) &&
611          (foldOffset2 < foldLength2 || length2 > 0 || length2 == INT32_MIN)) {
612        if(foldOffset1 < foldLength1) {
613            U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
614            cLength1 = 0;
615        } else {
616            c1 = UTEXT_NEXT32(s1);
617            if (c1 != U_SENTINEL) {
618                cLength1 = U16_LENGTH(c1);
619
620                length = ucase_toFullFolding(csp, c1, &p, options);
621                if(length >= 0) {
622                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
623                        u_memcpy(fold1, p, length);
624                        foldOffset1 = 0;
625                        foldLength1 = length;
626                        U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
627                    } else {
628                        c1 = length;
629                    }
630                }
631            }
632
633            if(length1 != INT32_MIN) {
634                length1 -= 1;
635            }
636        }
637
638        if(foldOffset2 < foldLength2) {
639            U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
640            cLength2 = 0;
641        } else {
642            c2 = UTEXT_NEXT32(s2);
643            if (c2 != U_SENTINEL) {
644                cLength2 = U16_LENGTH(c2);
645
646                length = ucase_toFullFolding(csp, c2, &p, options);
647                if(length >= 0) {
648                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
649                        u_memcpy(fold2, p, length);
650                        foldOffset2 = 0;
651                        foldLength2 = length;
652                        U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
653                    } else {
654                        c2 = length;
655                    }
656                }
657            } else if(c1 == U_SENTINEL) {
658                return 0; // end of both strings at once
659            }
660
661            if(length2 != INT32_MIN) {
662                length2 -= 1;
663            }
664        }
665
666        if(c1 != c2) {
667            return (int32_t)c1-(int32_t)c2;
668        }
669    }
670
671    /* By now at least one of the strings is out of characters */
672    length1 += foldLength1 - foldOffset1;
673    length2 += foldLength2 - foldOffset2;
674
675    if(length1 <= 0 && length1 != INT32_MIN) {
676        if(length2 <= 0) {
677            return 0;
678        } else {
679            return -1;
680        }
681    } else {
682        if (length1 <= 0) {
683            return 0;
684        } else {
685            return 1;
686        }
687    }
688}
689
690U_CAPI int32_t U_EXPORT2
691utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
692                                UText *s2, int64_t limit2,
693                                uint32_t options, UErrorCode *pErrorCode) {
694    const UCaseProps *csp;
695
696    /* case folding variables */
697    const UChar *p;
698    int32_t length;
699
700    /* case folding buffers, only use current-level start/limit */
701    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
702    int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;
703
704    /* current code points */
705    UChar32 c1, c2;
706
707    /* native indexes into s1 and s2 */
708    int64_t index1, index2;
709
710    /* argument checking */
711    if(U_FAILURE(*pErrorCode)) {
712        return 0;
713    }
714    if(s1==NULL || s2==NULL) {
715        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
716        return 0;
717    }
718
719    csp=ucase_getSingleton();
720
721    /* initialize */
722    index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
723    index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);
724
725    foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;
726
727    /* comparison loop */
728    while((foldOffset1 < foldLength1 || limit1 < 0 || index1 < limit1) &&
729          (foldOffset2 < foldLength2 || limit2 < 0 || index2 < limit2)) {
730        if(foldOffset1 < foldLength1) {
731            U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
732        } else {
733            c1 = UTEXT_NEXT32(s1);
734            if (c1 != U_SENTINEL) {
735                length = ucase_toFullFolding(csp, c1, &p, options);
736                if(length >= 0) {
737                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
738                        u_memcpy(fold1, p, length);
739                        foldOffset1 = 0;
740                        foldLength1 = length;
741                        U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
742                    } else {
743                        c1 = length;
744                    }
745                }
746            }
747
748            if (limit1 >= 0) {
749                index1 = UTEXT_GETNATIVEINDEX(s1);
750            }
751        }
752
753        if(foldOffset2 < foldLength2) {
754            U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
755        } else {
756            c2 = UTEXT_NEXT32(s2);
757            if (c2 != U_SENTINEL) {
758                length = ucase_toFullFolding(csp, c2, &p, options);
759                if(length >= 0) {
760                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
761                        u_memcpy(fold2, p, length);
762                        foldOffset2 = 0;
763                        foldLength2 = length;
764                        U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
765                    } else {
766                        c2 = length;
767                    }
768                }
769            } else if(c1 == U_SENTINEL) {
770                return 0;
771            }
772
773            if (limit2 >= 0) {
774                index2 = UTEXT_GETNATIVEINDEX(s2);
775            }
776        }
777
778        if(c1 != c2) {
779            return (int32_t)c1-(int32_t)c2;
780        }
781    }
782
783    /* By now at least one of the strings is out of characters */
784    index1 -= foldLength1 - foldOffset1;
785    index2 -= foldLength2 - foldOffset2;
786
787    if(limit1 >= 0 && index1 >= limit1) {
788        if(index2 >= limit2) {
789            return 0;
790        } else {
791            return -1;
792        }
793    } else {
794        if(index1 >= limit1) {
795            return 0;
796        } else {
797            return 1;
798        }
799    }
800}
801
802
803U_CAPI UBool U_EXPORT2
804utext_isWritable(const UText *ut)
805{
806    UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
807    return b;
808}
809
810
811U_CAPI void U_EXPORT2
812utext_freeze(UText *ut) {
813    // Zero out the WRITABLE flag.
814    ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
815}
816
817
818U_CAPI UBool U_EXPORT2
819utext_hasMetaData(const UText *ut)
820{
821    UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
822    return b;
823}
824
825
826
827U_CAPI int32_t U_EXPORT2
828utext_replace(UText *ut,
829             int64_t nativeStart, int64_t nativeLimit,
830             const UChar *replacementText, int32_t replacementLength,
831             UErrorCode *status)
832{
833    if (U_FAILURE(*status)) {
834        return 0;
835    }
836    if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
837        *status = U_NO_WRITE_PERMISSION;
838        return 0;
839    }
840    int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
841    return i;
842}
843
844U_CAPI void U_EXPORT2
845utext_copy(UText *ut,
846          int64_t nativeStart, int64_t nativeLimit,
847          int64_t destIndex,
848          UBool move,
849          UErrorCode *status)
850{
851    if (U_FAILURE(*status)) {
852        return;
853    }
854    if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
855        *status = U_NO_WRITE_PERMISSION;
856        return;
857    }
858    ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
859}
860
861
862
863U_CAPI UText * U_EXPORT2
864utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
865    UText *result;
866    result = src->pFuncs->clone(dest, src, deep, status);
867    if (readOnly) {
868        utext_freeze(result);
869    }
870    return result;
871}
872
873
874
875//------------------------------------------------------------------------------
876//
877//   UText common functions implementation
878//
879//------------------------------------------------------------------------------
880
//
//  Bit values kept in UText.flags
//
enum {
    // Set:   ICU has allocated this UText struct on the heap.
    // Clear: the caller provided the storage for the UText.
    UTEXT_HEAP_ALLOCATED       = 1 << 0,

    // Set:   ICU has allocated the extra storage as a separate heap block.
    // Clear: there is no separate allocation.  Either no extra storage was
    //        requested, or it is appended to the end of the main UText storage.
    UTEXT_EXTRA_HEAP_ALLOCATED = 1 << 1,

    // Set:   this UText is currently open.
    // Clear: this UText is not open.
    UTEXT_OPEN                 = 1 << 2
};
897
898
899//
900//  Extended form of a UText.  The purpose is to aid in computing the total size required
901//    when a provider asks for a UText to be allocated with extra storage.
902
903struct ExtendedUText {
904    UText          ut;
905    UAlignedMemory extension;
906};
907
908static const UText emptyText = UTEXT_INITIALIZER;
909
910U_CAPI UText * U_EXPORT2
911utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
912    if (U_FAILURE(*status)) {
913        return ut;
914    }
915
916    if (ut == NULL) {
917        // We need to heap-allocate storage for the new UText
918        int32_t spaceRequired = sizeof(UText);
919        if (extraSpace > 0) {
920            spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
921        }
922        ut = (UText *)uprv_malloc(spaceRequired);
923        if (ut == NULL) {
924            *status = U_MEMORY_ALLOCATION_ERROR;
925            return NULL;
926        } else {
927            *ut = emptyText;
928            ut->flags |= UTEXT_HEAP_ALLOCATED;
929            if (spaceRequired>0) {
930                ut->extraSize = extraSpace;
931                ut->pExtra    = &((ExtendedUText *)ut)->extension;
932            }
933        }
934    } else {
935        // We have been supplied with an already existing UText.
936        // Verify that it really appears to be a UText.
937        if (ut->magic != UTEXT_MAGIC) {
938            *status = U_ILLEGAL_ARGUMENT_ERROR;
939            return ut;
940        }
941        // If the ut is already open and there's a provider supplied close
942        //   function, call it.
943        if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL)  {
944            ut->pFuncs->close(ut);
945        }
946        ut->flags &= ~UTEXT_OPEN;
947
948        // If extra space was requested by our caller, check whether
949        //   sufficient already exists, and allocate new if needed.
950        if (extraSpace > ut->extraSize) {
951            // Need more space.  If there is existing separately allocated space,
952            //   delete it first, then allocate new space.
953            if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
954                uprv_free(ut->pExtra);
955                ut->extraSize = 0;
956            }
957            ut->pExtra = uprv_malloc(extraSpace);
958            if (ut->pExtra == NULL) {
959                *status = U_MEMORY_ALLOCATION_ERROR;
960            } else {
961                ut->extraSize = extraSpace;
962                ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
963            }
964        }
965    }
966    if (U_SUCCESS(*status)) {
967        ut->flags |= UTEXT_OPEN;
968
969        // Initialize all remaining fields of the UText.
970        //
971        ut->context             = NULL;
972        ut->chunkContents       = NULL;
973        ut->p                   = NULL;
974        ut->q                   = NULL;
975        ut->r                   = NULL;
976        ut->a                   = 0;
977        ut->b                   = 0;
978        ut->c                   = 0;
979        ut->chunkOffset         = 0;
980        ut->chunkLength         = 0;
981        ut->chunkNativeStart    = 0;
982        ut->chunkNativeLimit    = 0;
983        ut->nativeIndexingLimit = 0;
984        ut->providerProperties  = 0;
985        ut->privA               = 0;
986        ut->privB               = 0;
987        ut->privC               = 0;
988        ut->privP               = NULL;
989        if (ut->pExtra!=NULL && ut->extraSize>0)
990            uprv_memset(ut->pExtra, 0, ut->extraSize);
991
992    }
993    return ut;
994}
995
996
997U_CAPI UText * U_EXPORT2
998utext_close(UText *ut) {
999    if (ut==NULL ||
1000        ut->magic != UTEXT_MAGIC ||
1001        (ut->flags & UTEXT_OPEN) == 0)
1002    {
1003        // The supplied ut is not an open UText.
1004        // Do nothing.
1005        return ut;
1006    }
1007
1008    // If the provider gave us a close function, call it now.
1009    // This will clean up anything allocated specifically by the provider.
1010    if (ut->pFuncs->close != NULL) {
1011        ut->pFuncs->close(ut);
1012    }
1013    ut->flags &= ~UTEXT_OPEN;
1014
1015    // If we (the framework) allocated the UText or subsidiary storage,
1016    //   delete it.
1017    if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
1018        uprv_free(ut->pExtra);
1019        ut->pExtra = NULL;
1020        ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
1021        ut->extraSize = 0;
1022    }
1023
1024    // Zero out function table of the closed UText.  This is a defensive move,
1025    //   inteded to cause applications that inadvertantly use a closed
1026    //   utext to crash with null pointer errors.
1027    ut->pFuncs        = NULL;
1028
1029    if (ut->flags & UTEXT_HEAP_ALLOCATED) {
1030        // This UText was allocated by UText setup.  We need to free it.
1031        // Clear magic, so we can detect if the user messes up and immediately
1032        //  tries to reopen another UText using the deleted storage.
1033        ut->magic = 0;
1034        uprv_free(ut);
1035        ut = NULL;
1036    }
1037    return ut;
1038}
1039
1040
1041
1042
1043//
1044// invalidateChunk   Reset a chunk to have no contents, so that the next call
1045//                   to access will cause new data to load.
1046//                   This is needed when copy/move/replace operate directly on the
1047//                   backing text, potentially putting it out of sync with the
1048//                   contents in the chunk.
1049//
1050static void
1051invalidateChunk(UText *ut) {
1052    ut->chunkLength = 0;
1053    ut->chunkNativeLimit = 0;
1054    ut->chunkNativeStart = 0;
1055    ut->chunkOffset = 0;
1056    ut->nativeIndexingLimit = 0;
1057}
1058
1059//
1060// pinIndex        Do range pinning on a native index parameter.
1061//                 64 bit pinning is done in place.
1062//                 32 bit truncated result is returned as a convenience for
1063//                        use in providers that don't need 64 bits.
1064static int32_t
1065pinIndex(int64_t &index, int64_t limit) {
1066    if (index<0) {
1067        index = 0;
1068    } else if (index > limit) {
1069        index = limit;
1070    }
1071    return (int32_t)index;
1072}
1073
1074
1075U_CDECL_BEGIN
1076
1077//
1078// Pointer relocation function,
1079//   a utility used by shallow clone.
1080//   Adjust a pointer that refers to something within one UText (the source)
1081//   to refer to the same relative offset within a another UText (the target)
1082//
1083static void adjustPointer(UText *dest, const void **destPtr, const UText *src) {
1084    // convert all pointers to (char *) so that byte address arithmetic will work.
1085    char  *dptr = (char *)*destPtr;
1086    char  *dUText = (char *)dest;
1087    char  *sUText = (char *)src;
1088
1089    if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) {
1090        // target ptr was to something within the src UText's pExtra storage.
1091        //   relocate it into the target UText's pExtra region.
1092        *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra);
1093    } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
1094        // target ptr was pointing to somewhere within the source UText itself.
1095        //   Move it to the same offset within the target UText.
1096        *destPtr = dUText + (dptr-sUText);
1097    }
1098}
1099
1100
1101//
1102//  Clone.  This is a generic copy-the-utext-by-value clone function that can be
1103//          used as-is with some utext types, and as a helper by other clones.
1104//
1105static UText * U_CALLCONV
1106shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
1107    if (U_FAILURE(*status)) {
1108        return NULL;
1109    }
1110    int32_t  srcExtraSize = src->extraSize;
1111
1112    //
1113    // Use the generic text_setup to allocate storage if required.
1114    //
1115    dest = utext_setup(dest, srcExtraSize, status);
1116    if (U_FAILURE(*status)) {
1117        return dest;
1118    }
1119
1120    //
1121    //  flags (how the UText was allocated) and the pointer to the
1122    //   extra storage must retain the values in the cloned utext that
1123    //   were set up by utext_setup.  Save them separately before
1124    //   copying the whole struct.
1125    //
1126    void *destExtra = dest->pExtra;
1127    int32_t flags   = dest->flags;
1128
1129
1130    //
1131    //  Copy the whole UText struct by value.
1132    //  Any "Extra" storage is copied also.
1133    //
1134    int sizeToCopy = src->sizeOfStruct;
1135    if (sizeToCopy > dest->sizeOfStruct) {
1136        sizeToCopy = dest->sizeOfStruct;
1137    }
1138    uprv_memcpy(dest, src, sizeToCopy);
1139    dest->pExtra = destExtra;
1140    dest->flags  = flags;
1141    if (srcExtraSize > 0) {
1142        uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
1143    }
1144
1145    //
1146    // Relocate any pointers in the target that refer to the UText itself
1147    //   to point to the cloned copy rather than the original source.
1148    //
1149    adjustPointer(dest, &dest->context, src);
1150    adjustPointer(dest, &dest->p, src);
1151    adjustPointer(dest, &dest->q, src);
1152    adjustPointer(dest, &dest->r, src);
1153    adjustPointer(dest, (const void **)&dest->chunkContents, src);
1154
1155    return dest;
1156}
1157
1158
1159U_CDECL_END
1160
1161
1162
1163//------------------------------------------------------------------------------
1164//
1165//     UText implementation for UTF-8 char * strings (read-only)
//     Limitation:  string length must be <= 0x7fffffff.
//                  (length must fit in an int32_t variable)
1168//
1169//         Use of UText data members:
1170//              context    pointer to UTF-8 string
1171//              utext.b    is the input string length (bytes).
1172//              utext.c    Length scanned so far in string
1173//                           (for optimizing finding length of zero terminated strings.)
1174//              utext.p    pointer to the current buffer
1175//              utext.q    pointer to the other buffer.
1176//
1177//------------------------------------------------------------------------------
1178
// Chunk size.
//     Must be less than 85, because of byte mapping from UChar indexes to native indexes:
//     the map entries in UTF8Buf are uint8_t offsets.
//     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
//     to two UChars.)
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };

//
// UTF8Buf  Two of these structs will be set up in the UText's extra allocated space.
//          Each contains the UChar chunk buffer, the to and from native maps, and
//          header info.
//
//     Because backwards iteration fills the buffers starting at the end and
//     working towards the front, the filled part of the buffers may not begin
//     at the start of the available storage for the buffers.
//
//     The buffers are declared larger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
//     the last character added being a supplementary, and thus requiring a surrogate
//     pair.  Doing this is simpler than checking for the edge case.
//

struct UTF8Buf {
    int32_t   bufNativeStart;                        // Native index of first char in UChar buf
    int32_t   bufNativeLimit;                        // Native index following last char in buf.
    int32_t   bufStartIdx;                           // First filled position in buf.
    int32_t   bufLimitIdx;                           // Limit of filled range in buf.
    int32_t   bufNILimit;                            // Limit of native indexing part of buf
    int32_t   toUCharsMapStart;                      // Native index corresponding to
                                                     //   mapToUChars[0].
                                                     //   Set to bufNativeStart when filling forwards.
                                                     //   Set to computed value when filling backwards.

    UChar     buf[UTF8_TEXT_CHUNK_SIZE+4];           // The UChar buffer.  Requires extra positions beyond
                                                     //   the chunk size, to allow for surrogate at the end.
                                                     //   Length must be identical to mapToNative array, below,
                                                     //   because of the way indexing works when the array is
                                                     //   filled backwards during a reverse iteration.  Thus,
                                                     //   the additional extra size.
    uint8_t   mapToNative[UTF8_TEXT_CHUNK_SIZE+4];   // map UChar index in buf to
                                                     //  native offset from bufNativeStart.
                                                     //  Requires extra slots beyond the chunk size, including
                                                     //    one for a supplementary starting in the last normal position,
                                                     //    and one for an entry for the buffer limit position.
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
                                                     //   corresponding offset in filled part of buf.
    int32_t   align;                                 // Not referenced by the access code in this file;
                                                     //   presumably padding for alignment - TODO confirm.
};
1226
1227U_CDECL_BEGIN
1228
1229//
1230//   utf8TextLength
1231//
1232//        Get the length of the string.  If we don't already know it,
1233//              we'll need to scan for the trailing  nul.
1234//
1235static int64_t U_CALLCONV
1236utf8TextLength(UText *ut) {
1237    if (ut->b < 0) {
1238        // Zero terminated string, and we haven't scanned to the end yet.
1239        // Scan it now.
1240        const char *r = (const char *)ut->context + ut->c;
1241        while (*r != 0) {
1242            r++;
1243        }
1244        if ((r - (const char *)ut->context) < 0x7fffffff) {
1245            ut->b = (int32_t)(r - (const char *)ut->context);
1246        } else {
1247            // Actual string was bigger (more than 2 gig) than we
1248            //   can handle.  Clip it to 2 GB.
1249            ut->b = 0x7fffffff;
1250        }
1251        ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1252    }
1253    return ut->b;
1254}
1255
1256
1257
1258
1259
1260
//
//   utf8TextAccess
//
//        Access function for the UTF-8 provider.  Positions the UText chunk so
//        that it covers the requested native (byte) index, converting UTF-8 to
//        UTF-16 into one of the two UTF8Buf buffers as needed.
//        ut->p is the current buffer, ut->q is the alternate buffer.
//
//        index    Requested native index.  Pinned to the range 0 .. string length.
//        forward  TRUE:  the chunk must contain the character starting at index.
//                 FALSE: the chunk must contain the character preceding index.
//        returns  TRUE if the chunk now contains the requested character.
//                 FALSE if the request is at the end of the string (forward)
//                 or at the start of the string (backward); the chunk offset is
//                 still set so that iteration in the opposite direction works.
//
static UBool U_CALLCONV
utf8TextAccess(UText *ut, int64_t index, UBool forward) {
    //
    //  Apologies to those who are allergic to goto statements.
    //    Consider each goto to a labelled block to be the equivalent of
    //         call the named block as if it were a function();
    //         return;
    //
    const uint8_t *s8=(const uint8_t *)ut->context;
    UTF8Buf *u8b = NULL;
    int32_t  length = ut->b;         // Length of original utf-8
    int32_t  ix= (int32_t)index;     // Requested index, trimmed to 32 bits.
    int32_t  mapIndex = 0;
    if (index<0) {
        ix=0;
    } else if (index > 0x7fffffff) {
        // Strings with 64 bit lengths not supported by this UTF-8 provider.
        ix = 0x7fffffff;
    }

    // Pin requested index to the string length.
    //   A negative length means a zero terminated string whose length
    //   is not yet known.
    if (ix>length) {
        if (length>=0) {
            ix=length;
        } else if (ix>=ut->c) {
            // Zero terminated string, and requested index is beyond
            //   the region that has already been scanned.
            //   Scan up to either the end of the string or to the
            //   requested position, whichever comes first.
            while (ut->c<ix && s8[ut->c]!=0) {
                ut->c++;
            }
            //  TODO:  support for null terminated string length > 32 bits.
            if (s8[ut->c] == 0) {
                // We just found the actual length of the string.
                //  Trim the requested index back to that.
                ix     = ut->c;
                ut->b  = ut->c;
                length = ut->c;
                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
            }
        }
    }

    //
    // Dispatch to the appropriate action for a forward iteration request.
    //
    if (forward) {
        if (ix==ut->chunkNativeLimit) {
            // Check for normal sequential iteration cases first.
            if (ix==length) {
                // Just reached end of string
                // Don't swap buffers, but do set the
                //   current buffer position.
                ut->chunkOffset = ut->chunkLength;
                return FALSE;
            } else {
                // End of current buffer.
                //   check whether other buffer already has what we need.
                UTF8Buf *altB = (UTF8Buf *)ut->q;
                if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
                    goto swapBuffers;
                }
            }
        }

        // A random access.  Desired index could be in either or neither buf.
        // For optimizing the order of testing, first check for the index
        //    being in the other buffer.  This will be the case for uses that
        //    move back and forth over a fairly limited range
        {
            u8b = (UTF8Buf *)ut->q;   // the alternate buffer
            if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
                // Requested index is in the other buffer.
                goto swapBuffers;
            }
            if (ix == length) {
                // Requested index is end-of-string.
                //   (this is the case of randomly seeking to the end.
                //    The case of iterating off the end is handled earlier.)
                if (ix == ut->chunkNativeLimit) {
                    // Current buffer extends up to the end of the string.
                    //   Leave it as the current buffer.
                    ut->chunkOffset = ut->chunkLength;
                    return FALSE;
                }
                if (ix == u8b->bufNativeLimit) {
                    // Alternate buffer extends to the end of string.
                    //   Swap it in as the current buffer.
                    goto swapBuffersAndFail;
                }

                // Neither existing buffer extends to the end of the string.
                goto makeStubBuffer;
            }

            if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
                // Requested index is in neither buffer.
                goto fillForward;
            }

            // Requested index is in this buffer.
            //   Use the native-to-UChar map to set the chunk offset.
            u8b = (UTF8Buf *)ut->p;   // the current buffer
            mapIndex = ix - u8b->toUCharsMapStart;
            ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
            return TRUE;

        }
    }


    //
    // Dispatch to the appropriate action for a
    //   Backwards Direction iteration request.
    //   Note that the range tests below are shifted by one relative to the
    //   forward case (start exclusive, limit inclusive), because the character
    //   of interest is the one preceding the index.
    //
    if (ix==ut->chunkNativeStart) {
        // Check for normal sequential iteration cases first.
        if (ix==0) {
            // Just reached the start of string
            // Don't swap buffers, but do set the
            //   current buffer position.
            ut->chunkOffset = 0;
            return FALSE;
        } else {
            // Start of current buffer.
            //   check whether other buffer already has what we need.
            UTF8Buf *altB = (UTF8Buf *)ut->q;
            if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
                goto swapBuffers;
            }
        }
    }

    // A random access.  Desired index could be in either or neither buf.
    // For optimizing the order of testing,
    //    Most likely case:  in the other buffer.
    //    Second most likely: in neither buffer.
    //    Unlikely, but must work:  in the current buffer.
    u8b = (UTF8Buf *)ut->q;   // the alternate buffer
    if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
        // Requested index is in the other buffer.
        goto swapBuffers;
    }
    // Requested index is start-of-string.
    //   (this is the case of randomly seeking to the start.
    //    The case of iterating off the start is handled earlier.)
    if (ix==0) {
        if (u8b->bufNativeStart==0) {
            // Alternate buffer contains the data for the start string.
            // Make it be the current buffer.
            goto swapBuffersAndFail;
        } else {
            // Request for data before the start of string,
            //   neither buffer is usable.
            //   set up a zero-length buffer.
            goto makeStubBuffer;
        }
    }

    if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
        // Requested index is in neither buffer.
        goto fillReverse;
    }

    // Requested index is in this buffer.
    //   Set the utf16 buffer index.
    u8b = (UTF8Buf *)ut->p;
    mapIndex = ix - u8b->toUCharsMapStart;
    ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
    if (ut->chunkOffset==0) {
        // This occurs when the first character in the text is
        //   a multi-byte UTF-8 char, and the requested index is to
        //   one of the trailing bytes.  Because there is no preceding
        //   character, this access fails.  We can't pick up on the
        //   situation sooner because the requested index is not zero.
        return FALSE;
    } else {
        return TRUE;
    }



swapBuffers:
    //  The alternate buffer (ut->q) has the string data that was requested.
    //  Swap the primary and alternate buffers, and set the
    //   chunk index into the new primary buffer.
    {
        u8b   = (UTF8Buf *)ut->q;
        ut->q = ut->p;
        ut->p = u8b;
        ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
        ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
        ut->chunkNativeStart    = u8b->bufNativeStart;
        ut->chunkNativeLimit    = u8b->bufNativeLimit;
        ut->nativeIndexingLimit = u8b->bufNILimit;

        // Index into the (now current) chunk
        // Use the map to set the chunk index.  It's more trouble than it's worth
        //    to check whether native indexing can be used.
        U_ASSERT(ix>=u8b->bufNativeStart);
        U_ASSERT(ix<=u8b->bufNativeLimit);
        mapIndex = ix - u8b->toUCharsMapStart;
        U_ASSERT(mapIndex>=0);
        U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
        ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;

        return TRUE;
    }


 swapBuffersAndFail:
    // We got a request for either the start or end of the string,
    //  with iteration continuing in the out-of-bounds direction.
    // The alternate buffer already contains the data up to the
    //  start/end.
    // Swap the buffers, then return failure, indicating that we couldn't
    //  make things correct for continuing the iteration in the requested
    //  direction.  The position & buffer are correct should the
    //  user decide to iterate in the opposite direction.
    u8b   = (UTF8Buf *)ut->q;
    ut->q = ut->p;
    ut->p = u8b;
    ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
    ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
    ut->chunkNativeStart    = u8b->bufNativeStart;
    ut->chunkNativeLimit    = u8b->bufNativeLimit;
    ut->nativeIndexingLimit = u8b->bufNILimit;

    // Index into the (now current) chunk
    //  For this function  (swapBuffersAndFail), the requested index
    //    will always be at either the start or end of the chunk.
    if (ix==u8b->bufNativeLimit) {
        ut->chunkOffset = ut->chunkLength;
    } else  {
        ut->chunkOffset = 0;
        U_ASSERT(ix == u8b->bufNativeStart);
    }
    return FALSE;

makeStubBuffer:
    //   The user has done a seek/access past the start or end
    //   of the string.  Rather than loading data that is likely
    //   to never be used, just set up a zero-length buffer at
    //   the position.
    //   The stub is built in the alternate buffer; swapBuffersAndFail
    //   then makes it the current one.
    u8b = (UTF8Buf *)ut->q;
    u8b->bufNativeStart   = ix;
    u8b->bufNativeLimit   = ix;
    u8b->bufStartIdx      = 0;
    u8b->bufLimitIdx      = 0;
    u8b->bufNILimit       = 0;
    u8b->toUCharsMapStart = ix;
    u8b->mapToNative[0]   = 0;
    u8b->mapToUChars[0]   = 0;
    goto swapBuffersAndFail;



fillForward:
    {
        // Move the incoming index to a code point boundary.
        U8_SET_CP_START(s8, 0, ix);

        // Swap the UText buffers.
        //  We want to fill what was previously the alternate buffer,
        //  and make what was the current buffer be the new alternate.
        UTF8Buf *u8b = (UTF8Buf *)ut->q;
        ut->q = ut->p;
        ut->p = u8b;

        // For a zero terminated string of unknown length, use the largest
        //   possible length as the limit and watch for the terminating NUL.
        int32_t strLen = ut->b;
        UBool   nulTerminated = FALSE;
        if (strLen < 0) {
            strLen = 0x7fffffff;
            nulTerminated = TRUE;
        }

        UChar   *buf = u8b->buf;
        uint8_t *mapToNative  = u8b->mapToNative;
        uint8_t *mapToUChars  = u8b->mapToUChars;
        int32_t  destIx       = 0;
        int32_t  srcIx        = ix;
        UBool    seenNonAscii = FALSE;
        UChar32  c = 0;

        // Fill the chunk buffer and mapping arrays.
        while (destIx<UTF8_TEXT_CHUNK_SIZE) {
            c = s8[srcIx];
            if (c>0 && c<0x80) {
                // Special case ASCII range for speed.
                //   zero is excluded to simplify bounds checking.
                buf[destIx] = (UChar)c;
                mapToNative[destIx]    = (uint8_t)(srcIx - ix);
                mapToUChars[srcIx-ix]  = (uint8_t)destIx;
                srcIx++;
                destIx++;
            } else {
                // General case, handle everything.
                //   Native indexing is only valid up to the first
                //   char that takes this path.
                if (seenNonAscii == FALSE) {
                    seenNonAscii = TRUE;
                    u8b->bufNILimit = destIx;
                }

                int32_t  cIx      = srcIx;
                int32_t  dIx      = destIx;
                int32_t  dIxSaved = destIx;
                U8_NEXT(s8, srcIx, strLen, c);
                if (c==0 && nulTerminated) {
                    // Hit the terminating NUL.  Back up so that srcIx
                    //   is the index of the NUL, i.e. the string length.
                    srcIx--;
                    break;
                }
                if (c<0) {
                    // Illegal UTF-8.  Replace with sub character.
                    c = 0x0fffd;
                }

                // Every UChar produced for this code point maps back to
                //   the native index of its first byte ...
                U16_APPEND_UNSAFE(buf, destIx, c);
                do {
                    mapToNative[dIx++] = (uint8_t)(cIx - ix);
                } while (dIx < destIx);

                // ... and every byte of the code point maps to its first UChar.
                do {
                    mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;
                } while (cIx < srcIx);
            }
            if (srcIx>=strLen) {
                break;
            }

        }

        //  store Native <--> Chunk Map entries for the end of the buffer.
        //    There is no actual character here, but the index position is valid.
        mapToNative[destIx]     = (uint8_t)(srcIx - ix);
        mapToUChars[srcIx - ix] = (uint8_t)destIx;

        //  fill in Buffer descriptor
        u8b->bufNativeStart     = ix;
        u8b->bufNativeLimit     = srcIx;
        u8b->bufStartIdx        = 0;
        u8b->bufLimitIdx        = destIx;
        if (seenNonAscii == FALSE) {
            u8b->bufNILimit     = destIx;
        }
        u8b->toUCharsMapStart   = u8b->bufNativeStart;

        // Set UText chunk to refer to this buffer.
        ut->chunkContents       = buf;
        ut->chunkOffset         = 0;
        ut->chunkLength         = u8b->bufLimitIdx;
        ut->chunkNativeStart    = u8b->bufNativeStart;
        ut->chunkNativeLimit    = u8b->bufNativeLimit;
        ut->nativeIndexingLimit = u8b->bufNILimit;

        // For zero terminated strings, keep track of the maximum point
        //   scanned so far.
        if (nulTerminated && srcIx>ut->c) {
            ut->c = srcIx;
            if (c==0) {
                // We scanned to the end.
                //   Remember the actual length.
                ut->b = srcIx;
                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
            }
        }
        return TRUE;
    }


fillReverse:
    {
        // Move the incoming index to a code point boundary.
        // Can only do this if the incoming index is somewhere in the interior of the string.
        //   If index is at the end, there is no character there to look at.
        if (ix != ut->b) {
            U8_SET_CP_START(s8, 0, ix);
        }

        // Swap the UText buffers.
        //  We want to fill what was previously the alternate buffer,
        //  and make what was the current buffer be the new alternate.
        UTF8Buf *u8b = (UTF8Buf *)ut->q;
        ut->q = ut->p;
        ut->p = u8b;

        UChar   *buf = u8b->buf;
        uint8_t *mapToNative = u8b->mapToNative;
        uint8_t *mapToUChars = u8b->mapToUChars;
        // Native index that corresponds to mapToUChars[0].  Chosen so that the
        //   requested index ix lands at map offset UTF8_TEXT_CHUNK_SIZE*3 + 1,
        //   with the map filling downwards from there.
        int32_t  toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
        int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
                                                    //   at end of buffer to leave room
                                                    //   for a surrogate pair at the
                                                    //   buffer start.
        int32_t  srcIx  = ix;
        int32_t  bufNILimit = destIx;
        UChar32   c;

        // Map to/from Native Indexes, fill in for the position at the end of
        //   the buffer.
        //
        mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
        mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;

        // Fill the chunk buffer
        // Work backwards, filling from the end of the buffer towards the front.
        //   Stop while there is still room at the front of buf for a surrogate
        //   pair (destIx>2) and at the front of the native map for one more
        //   multi-byte character, or when the start of the string is reached.
        //
        while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
            srcIx--;
            destIx--;

            // Get last byte of the UTF-8 character
            c = s8[srcIx];
            if (c<0x80) {
                // Special case ASCII range for speed.
                buf[destIx] = (UChar)c;
                mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
                mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
            } else {
                // General case, handle everything non-ASCII.

                int32_t  sIx      = srcIx;  // ix of last byte of multi-byte u8 char

                // Get the full character from the UTF8 string.
                //   use code derived from the macros in utf8.h
                //   Leaves srcIx pointing at the first byte of the UTF-8 char.
                //
                if (c<=0xbf) {
                    c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -1);
                    // leaves srcIx at first byte of the multi-byte char.
                } else {
                    // A byte above 0xbf in the last-byte position:
                    //   substitute the replacement character.
                    c=0x0fffd;
                }

                // Store the character in UTF-16 buffer.
                if (c<0x10000) {
                    buf[destIx] = (UChar)c;
                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
                } else {
                    // Supplementary: trail surrogate first, since filling is
                    //   from back to front.  Both halves map to the same native index.
                    buf[destIx]         = U16_TRAIL(c);
                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
                    buf[--destIx]       = U16_LEAD(c);
                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
                }

                // Fill in the map from native indexes to UChars buf index.
                do {
                    mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
                } while (sIx >= srcIx);

                // Set native indexing limit to be the current position.
                //   We are processing a non-ascii, non-native-indexing char now;
                //     the limit will be here if the rest of the chars to be
                //     added to this buffer are ascii.
                bufNILimit = destIx;
            }
        }
        // Fill in the buffer descriptor.  The filled region runs from destIx
        //   up to the fixed limit position where filling started.
        u8b->bufNativeStart     = srcIx;
        u8b->bufNativeLimit     = ix;
        u8b->bufStartIdx        = destIx;
        u8b->bufLimitIdx        = UTF8_TEXT_CHUNK_SIZE+2;
        u8b->bufNILimit         = bufNILimit - u8b->bufStartIdx;
        u8b->toUCharsMapStart   = toUCharsMapStart;

        // Set UText chunk to refer to this buffer, positioned at its end,
        //   ready for backwards iteration.
        ut->chunkContents       = &buf[u8b->bufStartIdx];
        ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
        ut->chunkOffset         = ut->chunkLength;
        ut->chunkNativeStart    = u8b->bufNativeStart;
        ut->chunkNativeLimit    = u8b->bufNativeLimit;
        ut->nativeIndexingLimit = u8b->bufNILimit;
        return TRUE;
    }

}
1733
1734
1735
1736//
1737//  This is a slightly modified copy of u_strFromUTF8,
1738//     Inserts a Replacement Char rather than failing on invalid UTF-8
1739//     Removes unnecessary features.
1740//
1741static UChar*
1742utext_strFromUTF8(UChar *dest,
1743              int32_t destCapacity,
1744              int32_t *pDestLength,
1745              const char* src,
1746              int32_t srcLength,        // required.  NUL terminated not supported.
1747              UErrorCode *pErrorCode
1748              )
1749{
1750
1751    UChar *pDest = dest;
1752    UChar *pDestLimit = dest+destCapacity;
1753    UChar32 ch=0;
1754    int32_t index = 0;
1755    int32_t reqLength = 0;
1756    uint8_t* pSrc = (uint8_t*) src;
1757
1758
1759    while((index < srcLength)&&(pDest<pDestLimit)){
1760        ch = pSrc[index++];
1761        if(ch <=0x7f){
1762            *pDest++=(UChar)ch;
1763        }else{
1764            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
1765            if(ch<0){
1766                ch = 0xfffd;
1767            }
1768            if(U_IS_BMP(ch)){
1769                *(pDest++)=(UChar)ch;
1770            }else{
1771                *(pDest++)=UTF16_LEAD(ch);
1772                if(pDest<pDestLimit){
1773                    *(pDest++)=UTF16_TRAIL(ch);
1774                }else{
1775                    reqLength++;
1776                    break;
1777                }
1778            }
1779        }
1780    }
1781    /* donot fill the dest buffer just count the UChars needed */
1782    while(index < srcLength){
1783        ch = pSrc[index++];
1784        if(ch <= 0x7f){
1785            reqLength++;
1786        }else{
1787            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
1788            if(ch<0){
1789                ch = 0xfffd;
1790            }
1791            reqLength+=U16_LENGTH(ch);
1792        }
1793    }
1794
1795    reqLength+=(int32_t)(pDest - dest);
1796
1797    if(pDestLength){
1798        *pDestLength = reqLength;
1799    }
1800
1801    /* Terminate the buffer */
1802    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
1803
1804    return dest;
1805}
1806
1807
1808
1809static int32_t U_CALLCONV
1810utf8TextExtract(UText *ut,
1811                int64_t start, int64_t limit,
1812                UChar *dest, int32_t destCapacity,
1813                UErrorCode *pErrorCode) {
1814    if(U_FAILURE(*pErrorCode)) {
1815        return 0;
1816    }
1817    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1818        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1819        return 0;
1820    }
1821    int32_t  length  = ut->b;
1822    int32_t  start32 = pinIndex(start, length);
1823    int32_t  limit32 = pinIndex(limit, length);
1824
1825    if(start32>limit32) {
1826        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1827        return 0;
1828    }
1829
1830
1831    // adjust the incoming indexes to land on code point boundaries if needed.
1832    //    adjust by no more than three, because that is the largest number of trail bytes
1833    //    in a well formed UTF8 character.
1834    const uint8_t *buf = (const uint8_t *)ut->context;
1835    int i;
1836    if (start32 < ut->chunkNativeLimit) {
1837        for (i=0; i<3; i++) {
1838            if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
1839                break;
1840            }
1841            start32--;
1842        }
1843    }
1844
1845    if (limit32 < ut->chunkNativeLimit) {
1846        for (i=0; i<3; i++) {
1847            if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
1848                break;
1849            }
1850            limit32--;
1851        }
1852    }
1853
1854    // Do the actual extract.
1855    int32_t destLength=0;
1856    utext_strFromUTF8(dest, destCapacity, &destLength,
1857                    (const char *)ut->context+start32, limit32-start32,
1858                    pErrorCode);
1859    utf8TextAccess(ut, limit32, TRUE);
1860    return destLength;
1861}
1862
1863//
1864// utf8TextMapOffsetToNative
1865//
1866// Map a chunk (UTF-16) offset to a native index.
1867static int64_t U_CALLCONV
1868utf8TextMapOffsetToNative(const UText *ut) {
1869    //
1870    UTF8Buf *u8b = (UTF8Buf *)ut->p;
1871    U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength);
1872    int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart;
1873    U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit);
1874    return nativeOffset;
1875}
1876
1877//
1878// Map a native index to the corrsponding chunk offset
1879//
1880static int32_t U_CALLCONV
1881utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
1882    U_ASSERT(index64 <= 0x7fffffff);
1883    int32_t index = (int32_t)index64;
1884    UTF8Buf *u8b = (UTF8Buf *)ut->p;
1885    U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
1886    U_ASSERT(index<=ut->chunkNativeLimit);
1887    int32_t mapIndex = index - u8b->toUCharsMapStart;
1888    int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1889    U_ASSERT(offset>=0 && offset<=ut->chunkLength);
1890    return offset;
1891}
1892
1893static UText * U_CALLCONV
1894utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
1895{
1896    // First do a generic shallow clone.  Does everything needed for the UText struct itself.
1897    dest = shallowTextClone(dest, src, status);
1898
1899    // For deep clones, make a copy of the string.
1900    //  The copied storage is owned by the newly created clone.
1901    //
1902    // TODO:  There is an isssue with using utext_nativeLength().
1903    //        That function is non-const in cases where the input was NUL terminated
1904    //          and the length has not yet been determined.
1905    //        This function (clone()) is const.
1906    //        There potentially a thread safety issue lurking here.
1907    //
1908    if (deep && U_SUCCESS(*status)) {
1909        int32_t  len = (int32_t)utext_nativeLength((UText *)src);
1910        char *copyStr = (char *)uprv_malloc(len+1);
1911        if (copyStr == NULL) {
1912            *status = U_MEMORY_ALLOCATION_ERROR;
1913        } else {
1914            uprv_memcpy(copyStr, src->context, len+1);
1915            dest->context = copyStr;
1916            dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1917        }
1918    }
1919    return dest;
1920}
1921
1922
1923static void U_CALLCONV
1924utf8TextClose(UText *ut) {
1925    // Most of the work of close is done by the generic UText framework close.
1926    // All that needs to be done here is to delete the UTF8 string if the UText
1927    //  owns it.  This occurs if the UText was created by cloning.
1928    if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1929        char *s = (char *)ut->context;
1930        uprv_free(s);
1931        ut->context = NULL;
1932    }
1933}
1934
1935U_CDECL_END
1936
1937
// Function table for the UTF-8 text provider.
//   utext_openUTF8() stores the address of this table in UText.pFuncs.
//   The replace and copy slots are NULL: this table supplies no functions
//   for modifying the text.
static const struct UTextFuncs utf8Funcs =
{
    sizeof(UTextFuncs),
    0, 0, 0,             // Reserved alignment padding
    utf8TextClone,
    utf8TextLength,
    utf8TextAccess,
    utf8TextExtract,
    NULL,                /* replace*/
    NULL,                /* copy   */
    utf8TextMapOffsetToNative,
    utf8TextMapIndexToUTF16,
    utf8TextClose,
    NULL,                // spare 1
    NULL,                // spare 2
    NULL                 // spare 3
};
1955
1956
1957static const char gEmptyString[] = {0};
1958
1959U_CAPI UText * U_EXPORT2
1960utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
1961    if(U_FAILURE(*status)) {
1962        return NULL;
1963    }
1964    if(s==NULL && length==0) {
1965        s = gEmptyString;
1966    }
1967
1968    if(s==NULL || length<-1 || length>INT32_MAX) {
1969        *status=U_ILLEGAL_ARGUMENT_ERROR;
1970        return NULL;
1971    }
1972
1973    ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status);
1974    if (U_FAILURE(*status)) {
1975        return ut;
1976    }
1977
1978    ut->pFuncs  = &utf8Funcs;
1979    ut->context = s;
1980    ut->b       = (int32_t)length;
1981    ut->c       = (int32_t)length;
1982    if (ut->c < 0) {
1983        ut->c = 0;
1984        ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1985    }
1986    ut->p = ut->pExtra;
1987    ut->q = (char *)ut->pExtra + sizeof(UTF8Buf);
1988    return ut;
1989
1990}
1991
1992
1993
1994
1995
1996
1997
1998
1999//------------------------------------------------------------------------------
2000//
2001//     UText implementation wrapper for Replaceable (read/write)
2002//
2003//         Use of UText data members:
2004//            context    pointer to Replaceable.
//            providerProperties  the UTEXT_PROVIDER_OWNS_TEXT flag is set when the
//                       Replaceable is owned by the UText (deep clone); it is then
//                       deleted on close().
2006//
2007//------------------------------------------------------------------------------
2008
2009
2010
// minimum chunk size for this implementation: 3
// to allow for possible trimming for code point boundaries
//   (repTextAccess() may drop one UChar from each end of a chunk so that
//    a surrogate pair is never split across chunks.)
enum { REP_TEXT_CHUNK_SIZE=10 };

// Extra storage for a Replaceable UText.
//   utext_openReplaceable() requests sizeof(ReplExtra) bytes from utext_setup();
//   repTextAccess() reaches it through ut->pExtra and extracts the current
//   chunk of text into s.
struct ReplExtra {
    /*
     * Chunk UChars.
     * +1 to simplify filling with surrogate pair at the end.
     */
    UChar s[REP_TEXT_CHUNK_SIZE+1];
};
2022
2023
2024U_CDECL_BEGIN
2025
2026static UText * U_CALLCONV
2027repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
2028    // First do a generic shallow clone.  Does everything needed for the UText struct itself.
2029    dest = shallowTextClone(dest, src, status);
2030
2031    // For deep clones, make a copy of the Replaceable.
2032    //  The copied Replaceable storage is owned by the newly created UText clone.
2033    //  A non-NULL pointer in UText.p is the signal to the close() function to delete
2034    //    it.
2035    //
2036    if (deep && U_SUCCESS(*status)) {
2037        const Replaceable *replSrc = (const Replaceable *)src->context;
2038        dest->context = replSrc->clone();
2039        dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2040
2041        // with deep clone, the copy is writable, even when the source is not.
2042        dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2043    }
2044    return dest;
2045}
2046
2047
2048static void U_CALLCONV
2049repTextClose(UText *ut) {
2050    // Most of the work of close is done by the generic UText framework close.
2051    // All that needs to be done here is delete the Replaceable if the UText
2052    //  owns it.  This occurs if the UText was created by cloning.
2053    if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2054        Replaceable *rep = (Replaceable *)ut->context;
2055        delete rep;
2056        ut->context = NULL;
2057    }
2058}
2059
2060
2061static int64_t U_CALLCONV
2062repTextLength(UText *ut) {
2063    const Replaceable *replSrc = (const Replaceable *)ut->context;
2064    int32_t  len = replSrc->length();
2065    return len;
2066}
2067
2068
//
// repTextAccess
//
// Position the UText at a native index, loading a new chunk of text from
// the Replaceable when the current chunk does not cover the request.
//
//   ut        the Replaceable UText.  The chunk buffer is ReplExtra.s,
//             reached through ut->pExtra.
//   index     requested native (UTF-16) index; pinned to 0..length.
//   forward   TRUE:  the chunk must contain the text at and after index.
//             FALSE: the chunk must contain the text before index.
//
// Returns FALSE when the request is for the very end (forward) or the very
//   start (backward) of the text and the current chunk already reaches it;
//   TRUE in all other cases.
//
// On return chunkOffset corresponds to the requested index, moved back to a
//   code point boundary when a new chunk was loaded.  Chunks are trimmed so
//   that they neither end with a lead surrogate (unless at the end of the
//   text) nor begin with a trail surrogate (unless at the start of the text).
//
static UBool U_CALLCONV
repTextAccess(UText *ut, int64_t index, UBool forward) {
    const Replaceable *rep=(const Replaceable *)ut->context;
    int32_t length=rep->length();   // Full length of the input text (bigger than a chunk)

    // clip the requested index to the limits of the text.
    // NOTE(review): the unconverted 'index' is used again below, which is
    //   only safe if pinIndex() also clips its first argument in place -
    //   confirm against the definition of pinIndex().
    int32_t index32 = pinIndex(index, length);
    U_ASSERT(index<=INT32_MAX);


    /*
     * Compute start/limit boundaries around index, for a segment of text
     * to be extracted.
     * To allow for the possibility that our user gave an index to the trailing
     * half of a surrogate pair, we must request one extra preceding UChar when
     * going in the forward direction.  This will ensure that the buffer has the
     * entire code point at the specified index.
     */
    if(forward) {

        if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
            // Buffer already contains the requested position.
            ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
            return TRUE;
        }
        if (index32>=length && ut->chunkNativeLimit==length) {
            // Request for end of string, and buffer already extends up to it.
            // Can't get the data, but don't change the buffer.
            ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
            return FALSE;
        }

        ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
        // Going forward, so we want to have the buffer with stuff at and beyond
        //   the requested index.  The -1 gets us one UChar before the
        //   requested index also, to handle the case of the index being on
        //   a trail surrogate of a surrogate pair.
        if(ut->chunkNativeLimit > length) {
            ut->chunkNativeLimit = length;
        }
        // unless buffer ran off end, start is index-1.
        ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
        if(ut->chunkNativeStart < 0) {
            ut->chunkNativeStart = 0;
        }
    } else {
        // Reverse iteration.  Fill buffer with data preceding the requested index.
        if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
            // Requested position already in buffer.
            ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
            return TRUE;
        }
        if (index32==0 && ut->chunkNativeStart==0) {
            // Request for start, buffer already begins at start.
            //  No data, but keep the buffer as is.
            ut->chunkOffset = 0;
            return FALSE;
        }

        // Figure out the bounds of the chunk to extract for reverse iteration.
        // Need to worry about chunk not splitting surrogate pairs, and while still
        // containing the data we need.
        // Fix by requesting a chunk that includes an extra UChar at the end.
        // If this turns out to be a lead surrogate, we can lop it off and still have
        //   the data we wanted.
        ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE;
        if (ut->chunkNativeStart < 0) {
            ut->chunkNativeStart = 0;
        }

        ut->chunkNativeLimit = index32 + 1;
        if (ut->chunkNativeLimit > length) {
            ut->chunkNativeLimit = length;
        }
    }

    // Extract the new chunk of text from the Replaceable source.
    //   At most REP_TEXT_CHUNK_SIZE UChars are requested in either direction.
    ReplExtra *ex = (ReplExtra *)ut->pExtra;
    // UnicodeString with its buffer a writable alias to the chunk buffer
    UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
    rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);

    ut->chunkContents  = ex->s;
    ut->chunkLength    = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
    ut->chunkOffset    = (int32_t)(index32 - ut->chunkNativeStart);

    // Surrogate pairs from the input text must not span chunk boundaries.
    // If end of chunk could be the start of a surrogate, trim it off.
    //   (Not done at the very end of the text, where nothing can follow.)
    if (ut->chunkNativeLimit < length &&
        U16_IS_LEAD(ex->s[ut->chunkLength-1])) {
            ut->chunkLength--;
            ut->chunkNativeLimit--;
            // Keep the offset inside the shortened chunk.
            if (ut->chunkOffset > ut->chunkLength) {
                ut->chunkOffset = ut->chunkLength;
            }
        }

    // if the first UChar in the chunk could be the trailing half of a surrogate pair,
    // trim it off.
    //   The chunk then starts at ex->s[1]; start, length and offset all shift by one.
    if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
        ++(ut->chunkContents);
        ++(ut->chunkNativeStart);
        --(ut->chunkLength);
        --(ut->chunkOffset);
    }

    // adjust the index/chunkOffset to a code point boundary
    U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset);

    // Use fast indexing for get/setNativeIndex()
    //   (native indexes are UTF-16 indexes for this provider, so the whole
    //    chunk qualifies.)
    ut->nativeIndexingLimit = ut->chunkLength;

    return TRUE;
}
2183
2184
2185
2186static int32_t U_CALLCONV
2187repTextExtract(UText *ut,
2188               int64_t start, int64_t limit,
2189               UChar *dest, int32_t destCapacity,
2190               UErrorCode *status) {
2191    const Replaceable *rep=(const Replaceable *)ut->context;
2192    int32_t  length=rep->length();
2193
2194    if(U_FAILURE(*status)) {
2195        return 0;
2196    }
2197    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
2198        *status=U_ILLEGAL_ARGUMENT_ERROR;
2199    }
2200    if(start>limit) {
2201        *status=U_INDEX_OUTOFBOUNDS_ERROR;
2202        return 0;
2203    }
2204
2205    int32_t  start32 = pinIndex(start, length);
2206    int32_t  limit32 = pinIndex(limit, length);
2207
2208    // adjust start, limit if they point to trail half of surrogates
2209    if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
2210        U_IS_SUPPLEMENTARY(rep->char32At(start32))){
2211            start32--;
2212    }
2213    if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
2214        U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
2215            limit32--;
2216    }
2217
2218    length=limit32-start32;
2219    if(length>destCapacity) {
2220        limit32 = start32 + destCapacity;
2221    }
2222    UnicodeString buffer(dest, 0, destCapacity); // writable alias
2223    rep->extractBetween(start32, limit32, buffer);
2224    repTextAccess(ut, limit32, TRUE);
2225
2226    return u_terminateUChars(dest, destCapacity, length, status);
2227}
2228
2229static int32_t U_CALLCONV
2230repTextReplace(UText *ut,
2231               int64_t start, int64_t limit,
2232               const UChar *src, int32_t length,
2233               UErrorCode *status) {
2234    Replaceable *rep=(Replaceable *)ut->context;
2235    int32_t oldLength;
2236
2237    if(U_FAILURE(*status)) {
2238        return 0;
2239    }
2240    if(src==NULL && length!=0) {
2241        *status=U_ILLEGAL_ARGUMENT_ERROR;
2242        return 0;
2243    }
2244    oldLength=rep->length(); // will subtract from new length
2245    if(start>limit ) {
2246        *status=U_INDEX_OUTOFBOUNDS_ERROR;
2247        return 0;
2248    }
2249
2250    int32_t start32 = pinIndex(start, oldLength);
2251    int32_t limit32 = pinIndex(limit, oldLength);
2252
2253    // Snap start & limit to code point boundaries.
2254    if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
2255        start32>0 && U16_IS_LEAD(rep->charAt(start32-1)))
2256    {
2257            start32--;
2258    }
2259    if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) &&
2260        U16_IS_TRAIL(rep->charAt(limit32)))
2261    {
2262            limit32++;
2263    }
2264
2265    // Do the actual replace operation using methods of the Replaceable class
2266    UnicodeString replStr((UBool)(length<0), src, length); // read-only alias
2267    rep->handleReplaceBetween(start32, limit32, replStr);
2268    int32_t newLength = rep->length();
2269    int32_t lengthDelta = newLength - oldLength;
2270
2271    // Is the UText chunk buffer OK?
2272    if (ut->chunkNativeLimit > start32) {
2273        // this replace operation may have impacted the current chunk.
2274        // invalidate it, which will force a reload on the next access.
2275        invalidateChunk(ut);
2276    }
2277
2278    // set the iteration position to the end of the newly inserted replacement text.
2279    int32_t newIndexPos = limit32 + lengthDelta;
2280    repTextAccess(ut, newIndexPos, TRUE);
2281
2282    return lengthDelta;
2283}
2284
2285
2286static void U_CALLCONV
2287repTextCopy(UText *ut,
2288                int64_t start, int64_t limit,
2289                int64_t destIndex,
2290                UBool move,
2291                UErrorCode *status)
2292{
2293    Replaceable *rep=(Replaceable *)ut->context;
2294    int32_t length=rep->length();
2295
2296    if(U_FAILURE(*status)) {
2297        return;
2298    }
2299    if (start>limit || (start<destIndex && destIndex<limit))
2300    {
2301        *status=U_INDEX_OUTOFBOUNDS_ERROR;
2302        return;
2303    }
2304
2305    int32_t start32     = pinIndex(start, length);
2306    int32_t limit32     = pinIndex(limit, length);
2307    int32_t destIndex32 = pinIndex(destIndex, length);
2308
2309    // TODO:  snap input parameters to code point boundaries.
2310
2311    if(move) {
2312        // move: copy to destIndex, then replace original with nothing
2313        int32_t segLength=limit32-start32;
2314        rep->copy(start32, limit32, destIndex32);
2315        if(destIndex32<start32) {
2316            start32+=segLength;
2317            limit32+=segLength;
2318        }
2319        rep->handleReplaceBetween(start32, limit32, UnicodeString());
2320    } else {
2321        // copy
2322        rep->copy(start32, limit32, destIndex32);
2323    }
2324
2325    // If the change to the text touched the region in the chunk buffer,
2326    //  invalidate the buffer.
2327    int32_t firstAffectedIndex = destIndex32;
2328    if (move && start32<firstAffectedIndex) {
2329        firstAffectedIndex = start32;
2330    }
2331    if (firstAffectedIndex < ut->chunkNativeLimit) {
2332        // changes may have affected range covered by the chunk
2333        invalidateChunk(ut);
2334    }
2335
2336    // Put iteration position at the newly inserted (moved) block,
2337    int32_t  nativeIterIndex = destIndex32 + limit32 - start32;
2338    if (move && destIndex32>start32) {
2339        // moved a block of text towards the end of the string.
2340        nativeIterIndex = destIndex32;
2341    }
2342
2343    // Set position, reload chunk if needed.
2344    repTextAccess(ut, nativeIterIndex, TRUE);
2345}
2346
// Function table for the Replaceable text provider.
//   utext_openReplaceable() stores the address of this table in UText.pFuncs.
//   The two index mapping slots are NULL.
static const struct UTextFuncs repFuncs =
{
    sizeof(UTextFuncs),
    0, 0, 0,           // Reserved alignment padding
    repTextClone,
    repTextLength,
    repTextAccess,
    repTextExtract,
    repTextReplace,
    repTextCopy,
    NULL,              // MapOffsetToNative,
    NULL,              // MapIndexToUTF16,
    repTextClose,
    NULL,              // spare 1
    NULL,              // spare 2
    NULL               // spare 3
};
2364
2365
2366U_CAPI UText * U_EXPORT2
2367utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
2368{
2369    if(U_FAILURE(*status)) {
2370        return NULL;
2371    }
2372    if(rep==NULL) {
2373        *status=U_ILLEGAL_ARGUMENT_ERROR;
2374        return NULL;
2375    }
2376    ut = utext_setup(ut, sizeof(ReplExtra), status);
2377
2378    ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2379    if(rep->hasMetaData()) {
2380        ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
2381    }
2382
2383    ut->pFuncs  = &repFuncs;
2384    ut->context =  rep;
2385    return ut;
2386}
2387
2388U_CDECL_END
2389
2390
2391
2392
2393
2394
2395
2396
2397//------------------------------------------------------------------------------
2398//
2399//     UText implementation for UnicodeString (read/write)  and
2400//                    for const UnicodeString (read only)
2401//             (same implementation, only the flags are different)
2402//
2403//         Use of UText data members:
2404//            context    pointer to UnicodeString
//            providerProperties  the UTEXT_PROVIDER_OWNS_TEXT flag is set IF this UText
//                       owns the string, which must then be deleted on close().
2407//
2408//------------------------------------------------------------------------------
2409
2410U_CDECL_BEGIN
2411
2412
2413static UText * U_CALLCONV
2414unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
2415    // First do a generic shallow clone.  Does everything needed for the UText struct itself.
2416    dest = shallowTextClone(dest, src, status);
2417
2418    // For deep clones, make a copy of the UnicodeSring.
2419    //  The copied UnicodeString storage is owned by the newly created UText clone.
2420    //  A non-NULL pointer in UText.p is the signal to the close() function to delete
2421    //    the UText.
2422    //
2423    if (deep && U_SUCCESS(*status)) {
2424        const UnicodeString *srcString = (const UnicodeString *)src->context;
2425        dest->context = new UnicodeString(*srcString);
2426        dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2427
2428        // with deep clone, the copy is writable, even when the source is not.
2429        dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2430    }
2431    return dest;
2432}
2433
2434static void U_CALLCONV
2435unistrTextClose(UText *ut) {
2436    // Most of the work of close is done by the generic UText framework close.
2437    // All that needs to be done here is delete the UnicodeString if the UText
2438    //  owns it.  This occurs if the UText was created by cloning.
2439    if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2440        UnicodeString *str = (UnicodeString *)ut->context;
2441        delete str;
2442        ut->context = NULL;
2443    }
2444}
2445
2446
2447static int64_t U_CALLCONV
2448unistrTextLength(UText *t) {
2449    return ((const UnicodeString *)t->context)->length();
2450}
2451
2452
2453static UBool U_CALLCONV
2454unistrTextAccess(UText *ut, int64_t index, UBool  forward) {
2455    int32_t length  = ut->chunkLength;
2456    ut->chunkOffset = pinIndex(index, length);
2457
2458    // Check whether request is at the start or end
2459    UBool retVal = (forward && index<length) || (!forward && index>0);
2460    return retVal;
2461}
2462
2463
2464
2465static int32_t U_CALLCONV
2466unistrTextExtract(UText *t,
2467                  int64_t start, int64_t limit,
2468                  UChar *dest, int32_t destCapacity,
2469                  UErrorCode *pErrorCode) {
2470    const UnicodeString *us=(const UnicodeString *)t->context;
2471    int32_t length=us->length();
2472
2473    if(U_FAILURE(*pErrorCode)) {
2474        return 0;
2475    }
2476    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
2477        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2478    }
2479    if(start<0 || start>limit) {
2480        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2481        return 0;
2482    }
2483
2484    int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
2485    int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
2486
2487    length=limit32-start32;
2488    if (destCapacity>0 && dest!=NULL) {
2489        int32_t trimmedLength = length;
2490        if(trimmedLength>destCapacity) {
2491            trimmedLength=destCapacity;
2492        }
2493        us->extract(start32, trimmedLength, dest);
2494        t->chunkOffset = start32+trimmedLength;
2495    } else {
2496        t->chunkOffset = start32;
2497    }
2498    u_terminateUChars(dest, destCapacity, length, pErrorCode);
2499    return length;
2500}
2501
2502static int32_t U_CALLCONV
2503unistrTextReplace(UText *ut,
2504                  int64_t start, int64_t limit,
2505                  const UChar *src, int32_t length,
2506                  UErrorCode *pErrorCode) {
2507    UnicodeString *us=(UnicodeString *)ut->context;
2508    int32_t oldLength;
2509
2510    if(U_FAILURE(*pErrorCode)) {
2511        return 0;
2512    }
2513    if(src==NULL && length!=0) {
2514        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2515    }
2516    if(start>limit) {
2517        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2518        return 0;
2519    }
2520    oldLength=us->length();
2521    int32_t start32 = pinIndex(start, oldLength);
2522    int32_t limit32 = pinIndex(limit, oldLength);
2523    if (start32 < oldLength) {
2524        start32 = us->getChar32Start(start32);
2525    }
2526    if (limit32 < oldLength) {
2527        limit32 = us->getChar32Start(limit32);
2528    }
2529
2530    // replace
2531    us->replace(start32, limit32-start32, src, length);
2532    int32_t newLength = us->length();
2533
2534    // Update the chunk description.
2535    ut->chunkContents    = us->getBuffer();
2536    ut->chunkLength      = newLength;
2537    ut->chunkNativeLimit = newLength;
2538    ut->nativeIndexingLimit = newLength;
2539
2540    // Set iteration position to the point just following the newly inserted text.
2541    int32_t lengthDelta = newLength - oldLength;
2542    ut->chunkOffset = limit32 + lengthDelta;
2543
2544    return lengthDelta;
2545}
2546
2547static void U_CALLCONV
2548unistrTextCopy(UText *ut,
2549               int64_t start, int64_t limit,
2550               int64_t destIndex,
2551               UBool move,
2552               UErrorCode *pErrorCode) {
2553    UnicodeString *us=(UnicodeString *)ut->context;
2554    int32_t length=us->length();
2555
2556    if(U_FAILURE(*pErrorCode)) {
2557        return;
2558    }
2559    int32_t start32 = pinIndex(start, length);
2560    int32_t limit32 = pinIndex(limit, length);
2561    int32_t destIndex32 = pinIndex(destIndex, length);
2562
2563    if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) {
2564        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2565        return;
2566    }
2567
2568    if(move) {
2569        // move: copy to destIndex, then replace original with nothing
2570        int32_t segLength=limit32-start32;
2571        us->copy(start32, limit32, destIndex32);
2572        if(destIndex32<start32) {
2573            start32+=segLength;
2574        }
2575        us->replace(start32, segLength, NULL, 0);
2576    } else {
2577        // copy
2578        us->copy(start32, limit32, destIndex32);
2579    }
2580
2581    // update chunk description, set iteration position.
2582    ut->chunkContents = us->getBuffer();
2583    if (move==FALSE) {
2584        // copy operation, string length grows
2585        ut->chunkLength += limit32-start32;
2586        ut->chunkNativeLimit = ut->chunkLength;
2587        ut->nativeIndexingLimit = ut->chunkLength;
2588    }
2589
2590    // Iteration position to end of the newly inserted text.
2591    ut->chunkOffset = destIndex32+limit32-start32;
2592    if (move && destIndex32>start32) {
2593        ut->chunkOffset = destIndex32;
2594    }
2595
2596}
2597
// Function table for the UnicodeString text provider.
//   utext_openUnicodeString() and utext_openConstUnicodeString() both store
//   the address of this table in UText.pFuncs.
//   The two index mapping slots are NULL.
static const struct UTextFuncs unistrFuncs =
{
    sizeof(UTextFuncs),
    0, 0, 0,             // Reserved alignment padding
    unistrTextClone,
    unistrTextLength,
    unistrTextAccess,
    unistrTextExtract,
    unistrTextReplace,
    unistrTextCopy,
    NULL,                // MapOffsetToNative,
    NULL,                // MapIndexToUTF16,
    unistrTextClose,
    NULL,                // spare 1
    NULL,                // spare 2
    NULL                 // spare 3
};
2615
2616
2617
2618U_CDECL_END
2619
2620
2621U_CAPI UText * U_EXPORT2
2622utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
2623    // TODO:  use openConstUnicodeString, then add in the differences.
2624    //
2625    ut = utext_setup(ut, 0, status);
2626    if (U_SUCCESS(*status)) {
2627        ut->pFuncs              = &unistrFuncs;
2628        ut->context             = s;
2629        ut->providerProperties  = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)|
2630                                  I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2631
2632        ut->chunkContents       = s->getBuffer();
2633        ut->chunkLength         = s->length();
2634        ut->chunkNativeStart    = 0;
2635        ut->chunkNativeLimit    = ut->chunkLength;
2636        ut->nativeIndexingLimit = ut->chunkLength;
2637    }
2638    return ut;
2639}
2640
2641
2642
2643U_CAPI UText * U_EXPORT2
2644utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
2645    ut = utext_setup(ut, 0, status);
2646    //    note:  use the standard (writable) function table for UnicodeString.
2647    //           The flag settings disable writing, so having the functions in
2648    //           the table is harmless.
2649    if (U_SUCCESS(*status)) {
2650        ut->pFuncs              = &unistrFuncs;
2651        ut->context             = s;
2652        ut->providerProperties  = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2653        ut->chunkContents       = s->getBuffer();
2654        ut->chunkLength         = s->length();
2655        ut->chunkNativeStart    = 0;
2656        ut->chunkNativeLimit    = ut->chunkLength;
2657        ut->nativeIndexingLimit = ut->chunkLength;
2658    }
2659    return ut;
2660}
2661
2662//------------------------------------------------------------------------------
2663//
2664//     UText implementation for const UChar * strings
2665//
2666//         Use of UText data members:
//            context    pointer to the UChar string (const UChar *)
2668//            a          length.  -1 if not yet known.
2669//
2670//         TODO:  support 64 bit lengths.
2671//
2672//------------------------------------------------------------------------------
2673
2674U_CDECL_BEGIN
2675
2676
2677static UText * U_CALLCONV
2678ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) {
2679    // First do a generic shallow clone.
2680    dest = shallowTextClone(dest, src, status);
2681
2682    // For deep clones, make a copy of the string.
2683    //  The copied storage is owned by the newly created clone.
2684    //  A non-NULL pointer in UText.p is the signal to the close() function to delete
2685    //    it.
2686    //
2687    if (deep && U_SUCCESS(*status)) {
2688        U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
2689        int32_t  len = (int32_t)utext_nativeLength(dest);
2690
2691        // The cloned string IS going to be NUL terminated, whether or not the original was.
2692        const UChar *srcStr = (const UChar *)src->context;
2693        UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar));
2694        if (copyStr == NULL) {
2695            *status = U_MEMORY_ALLOCATION_ERROR;
2696        } else {
2697            int64_t i;
2698            for (i=0; i<len; i++) {
2699                copyStr[i] = srcStr[i];
2700            }
2701            copyStr[len] = 0;
2702            dest->context = copyStr;
2703            dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2704        }
2705    }
2706    return dest;
2707}
2708
2709
2710static void U_CALLCONV
2711ucstrTextClose(UText *ut) {
2712    // Most of the work of close is done by the generic UText framework close.
2713    // All that needs to be done here is delete the string if the UText
2714    //  owns it.  This occurs if the UText was created by cloning.
2715    if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2716        UChar *s = (UChar *)ut->context;
2717        uprv_free(s);
2718        ut->context = NULL;
2719    }
2720}
2721
2722
2723
2724static int64_t U_CALLCONV
2725ucstrTextLength(UText *ut) {
2726    if (ut->a < 0) {
2727        // null terminated, we don't yet know the length.  Scan for it.
2728        //    Access is not convenient for doing this
2729        //    because the current interation postion can't be changed.
2730        const UChar  *str = (const UChar *)ut->context;
2731        for (;;) {
2732            if (str[ut->chunkNativeLimit] == 0) {
2733                break;
2734            }
2735            ut->chunkNativeLimit++;
2736        }
2737        ut->a = ut->chunkNativeLimit;
2738        ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2739        ut->nativeIndexingLimit = ut->chunkLength;
2740        ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2741    }
2742    return ut->a;
2743}
2744
2745
2746static UBool U_CALLCONV
2747ucstrTextAccess(UText *ut, int64_t index, UBool  forward) {
2748    const UChar *str   = (const UChar *)ut->context;
2749
2750    // pin the requested index to the bounds of the string,
2751    //  and set current iteration position.
2752    if (index<0) {
2753        index = 0;
2754    } else if (index < ut->chunkNativeLimit) {
2755        // The request data is within the chunk as it is known so far.
2756        // Put index on a code point boundary.
2757        U16_SET_CP_START(str, 0, index);
2758    } else if (ut->a >= 0) {
2759        // We know the length of this string, and the user is requesting something
2760        // at or beyond the length.  Pin the requested index to the length.
2761        index = ut->a;
2762    } else {
2763        // Null terminated string, length not yet known, and the requested index
2764        //  is beyond where we have scanned so far.
2765        //  Scan to 32 UChars beyond the requested index.  The strategy here is
2766        //  to avoid fully scanning a long string when the caller only wants to
2767        //  see a few characters at its beginning.
2768        int32_t scanLimit = (int32_t)index + 32;
2769        if ((index + 32)>INT32_MAX || (index + 32)<0 ) {   // note: int64 expression
2770            scanLimit = INT32_MAX;
2771        }
2772
2773        int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;
2774        for (; chunkLimit<scanLimit; chunkLimit++) {
2775            if (str[chunkLimit] == 0) {
2776                // We found the end of the string.  Remember it, pin the requested index to it,
2777                //  and bail out of here.
2778                ut->a = chunkLimit;
2779                ut->chunkLength = chunkLimit;
2780                ut->nativeIndexingLimit = chunkLimit;
2781                if (index >= chunkLimit) {
2782                    index = chunkLimit;
2783                } else {
2784                    U16_SET_CP_START(str, 0, index);
2785                }
2786
2787                ut->chunkNativeLimit = chunkLimit;
2788                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2789                goto breakout;
2790            }
2791        }
2792        // We scanned through the next batch of UChars without finding the end.
2793        U16_SET_CP_START(str, 0, index);
2794        if (chunkLimit == INT32_MAX) {
2795            // Scanned to the limit of a 32 bit length.
2796            // Forceably trim the overlength string back so length fits in int32
2797            //  TODO:  add support for 64 bit strings.
2798            ut->a = chunkLimit;
2799            ut->chunkLength = chunkLimit;
2800            ut->nativeIndexingLimit = chunkLimit;
2801            if (index > chunkLimit) {
2802                index = chunkLimit;
2803            }
2804            ut->chunkNativeLimit = chunkLimit;
2805            ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2806        } else {
2807            // The endpoint of a chunk must not be left in the middle of a surrogate pair.
2808            // If the current end is on a lead surrogate, back the end up by one.
2809            // It doesn't matter if the end char happens to be an unpaired surrogate,
2810            //    and it's simpler not to worry about it.
2811            if (U16_IS_LEAD(str[chunkLimit-1])) {
2812                --chunkLimit;
2813            }
2814            // Null-terminated chunk with end still unknown.
2815            // Update the chunk length to reflect what has been scanned thus far.
2816            // That the full length is still unknown is (still) flagged by
2817            //    ut->a being < 0.
2818            ut->chunkNativeLimit = chunkLimit;
2819            ut->nativeIndexingLimit = chunkLimit;
2820            ut->chunkLength = chunkLimit;
2821        }
2822
2823    }
2824breakout:
2825    U_ASSERT(index<=INT32_MAX);
2826    ut->chunkOffset = (int32_t)index;
2827
2828    // Check whether request is at the start or end
2829    UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0);
2830    return retVal;
2831}
2832
2833
2834
2835static int32_t U_CALLCONV
2836ucstrTextExtract(UText *ut,
2837                  int64_t start, int64_t limit,
2838                  UChar *dest, int32_t destCapacity,
2839                  UErrorCode *pErrorCode)
2840{
2841    if(U_FAILURE(*pErrorCode)) {
2842        return 0;
2843    }
2844    if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
2845        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2846        return 0;
2847    }
2848
2849    const UChar *s=(const UChar *)ut->context;
2850    int32_t si, di;
2851
2852    int32_t start32;
2853    int32_t limit32;
2854
2855    // Access the start.  Does two things we need:
2856    //   Pins 'start' to the length of the string, if it came in out-of-bounds.
2857    //   Snaps 'start' to the beginning of a code point.
2858    ucstrTextAccess(ut, start, TRUE);
2859    U_ASSERT(start <= INT32_MAX);
2860    start32 = (int32_t)start;
2861
2862    int32_t strLength=(int32_t)ut->a;
2863    if (strLength >= 0) {
2864        limit32 = pinIndex(limit, strLength);
2865    } else {
2866        limit32 = pinIndex(limit, INT32_MAX);
2867    }
2868
2869    di = 0;
2870    for (si=start32; si<limit32; si++) {
2871        if (strLength<0 && s[si]==0) {
2872            // Just hit the end of a null-terminated string.
2873            ut->a = si;               // set string length for this UText
2874            ut->chunkNativeLimit    = si;
2875            ut->chunkLength         = si;
2876            ut->nativeIndexingLimit = si;
2877            strLength               = si;
2878            break;
2879        }
2880        if (di<destCapacity) {
2881            // only store if there is space.
2882            dest[di] = s[si];
2883        } else {
2884            if (strLength>=0) {
2885                // We have filled the destination buffer, and the string length is known.
2886                //  Cut the loop short.  There is no need to scan string termination.
2887                di = limit32 - start32;
2888                si = limit32;
2889                break;
2890            }
2891        }
2892        di++;
2893    }
2894
2895    // If the limit index points to a lead surrogate of a pair,
2896    //   add the corresponding trail surrogate to the destination.
2897    if (si>0 && U16_IS_LEAD(s[si-1]) &&
2898        ((si<strLength || strLength<0)  && U16_IS_TRAIL(s[si])))
2899    {
2900        if (di<destCapacity) {
2901            // store only if there is space in the output buffer.
2902            dest[di++] = s[si++];
2903        }
2904    }
2905
2906    // Put iteration position at the point just following the extracted text
2907    ut->chunkOffset = uprv_min(strLength, start32 + destCapacity);
2908
2909    // Add a terminating NUL if space in the buffer permits,
2910    // and set the error status as required.
2911    u_terminateUChars(dest, destCapacity, di, pErrorCode);
2912    return di;
2913}
2914
2915static const struct UTextFuncs ucstrFuncs =
2916{
2917    sizeof(UTextFuncs),
2918    0, 0, 0,           // Reserved alignment padding
2919    ucstrTextClone,
2920    ucstrTextLength,
2921    ucstrTextAccess,
2922    ucstrTextExtract,
2923    NULL,              // Replace
2924    NULL,              // Copy
2925    NULL,              // MapOffsetToNative,
2926    NULL,              // MapIndexToUTF16,
2927    ucstrTextClose,
2928    NULL,              // spare 1
2929    NULL,              // spare 2
2930    NULL,              // spare 3
2931};
2932
2933U_CDECL_END
2934
2935static const UChar gEmptyUString[] = {0};
2936
2937U_CAPI UText * U_EXPORT2
2938utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) {
2939    if (U_FAILURE(*status)) {
2940        return NULL;
2941    }
2942    if(s==NULL && length==0) {
2943        s = gEmptyUString;
2944    }
2945    if (s==NULL || length < -1 || length>INT32_MAX) {
2946        *status = U_ILLEGAL_ARGUMENT_ERROR;
2947        return NULL;
2948    }
2949    ut = utext_setup(ut, 0, status);
2950    if (U_SUCCESS(*status)) {
2951        ut->pFuncs               = &ucstrFuncs;
2952        ut->context              = s;
2953        ut->providerProperties   = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2954        if (length==-1) {
2955            ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2956        }
2957        ut->a                    = length;
2958        ut->chunkContents        = s;
2959        ut->chunkNativeStart     = 0;
2960        ut->chunkNativeLimit     = length>=0? length : 0;
2961        ut->chunkLength          = (int32_t)ut->chunkNativeLimit;
2962        ut->chunkOffset          = 0;
2963        ut->nativeIndexingLimit  = ut->chunkLength;
2964    }
2965    return ut;
2966}
2967
2968
2969//------------------------------------------------------------------------------
2970//
2971//     UText implementation for text from ICU CharacterIterators
2972//
2973//         Use of UText data members:
2974//            context    pointer to the CharacterIterator
2975//            a          length of the full text.
2976//            p          pointer to  buffer 1
2977//            b          start index of local buffer 1 contents
2978//            q          pointer to buffer 2
2979//            c          start index of local buffer 2 contents
2980//            r          pointer to the character iterator if the UText owns it.
2981//                       Null otherwise.
2982//
2983//------------------------------------------------------------------------------
2984#define CIBufSize 16
2985
2986U_CDECL_BEGIN
2987static void U_CALLCONV
2988charIterTextClose(UText *ut) {
2989    // Most of the work of close is done by the generic UText framework close.
2990    // All that needs to be done here is delete the CharacterIterator if the UText
2991    //  owns it.  This occurs if the UText was created by cloning.
2992    CharacterIterator *ci = (CharacterIterator *)ut->r;
2993    delete ci;
2994    ut->r = NULL;
2995}
2996
2997static int64_t U_CALLCONV
2998charIterTextLength(UText *ut) {
2999    return (int32_t)ut->a;
3000}
3001
3002static UBool U_CALLCONV
3003charIterTextAccess(UText *ut, int64_t index, UBool  forward) {
3004    CharacterIterator *ci   = (CharacterIterator *)ut->context;
3005
3006    int32_t clippedIndex = (int32_t)index;
3007    if (clippedIndex<0) {
3008        clippedIndex=0;
3009    } else if (clippedIndex>=ut->a) {
3010        clippedIndex=(int32_t)ut->a;
3011    }
3012    int32_t neededIndex = clippedIndex;
3013    if (!forward && neededIndex>0) {
3014        // reverse iteration, want the position just before what was asked for.
3015        neededIndex--;
3016    } else if (forward && neededIndex==ut->a && neededIndex>0) {
3017        // Forward iteration, don't ask for something past the end of the text.
3018        neededIndex--;
3019    }
3020
3021    // Find the native index of the start of the buffer containing what we want.
3022    neededIndex -= neededIndex % CIBufSize;
3023
3024    UChar *buf = NULL;
3025    UBool  needChunkSetup = TRUE;
3026    int    i;
3027    if (ut->chunkNativeStart == neededIndex) {
3028        // The buffer we want is already the current chunk.
3029        needChunkSetup = FALSE;
3030    } else if (ut->b == neededIndex) {
3031        // The first buffer (buffer p) has what we need.
3032        buf = (UChar *)ut->p;
3033    } else if (ut->c == neededIndex) {
3034        // The second buffer (buffer q) has what we need.
3035        buf = (UChar *)ut->q;
3036    } else {
3037        // Neither buffer already has what we need.
3038        // Load new data from the character iterator.
3039        // Use the buf that is not the current buffer.
3040        buf = (UChar *)ut->p;
3041        if (ut->p == ut->chunkContents) {
3042            buf = (UChar *)ut->q;
3043        }
3044        ci->setIndex(neededIndex);
3045        for (i=0; i<CIBufSize; i++) {
3046            buf[i] = ci->nextPostInc();
3047            if (i+neededIndex > ut->a) {
3048                break;
3049            }
3050        }
3051    }
3052
3053    // We have a buffer with the data we need.
3054    // Set it up as the current chunk, if it wasn't already.
3055    if (needChunkSetup) {
3056        ut->chunkContents = buf;
3057        ut->chunkLength   = CIBufSize;
3058        ut->chunkNativeStart = neededIndex;
3059        ut->chunkNativeLimit = neededIndex + CIBufSize;
3060        if (ut->chunkNativeLimit > ut->a) {
3061            ut->chunkNativeLimit = ut->a;
3062            ut->chunkLength  = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
3063        }
3064        ut->nativeIndexingLimit = ut->chunkLength;
3065        U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize);
3066    }
3067    ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
3068    UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0);
3069    return success;
3070}
3071
3072static UText * U_CALLCONV
3073charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) {
3074    if (U_FAILURE(*status)) {
3075        return NULL;
3076    }
3077
3078    if (deep) {
3079        // There is no CharacterIterator API for cloning the underlying text storage.
3080        *status = U_UNSUPPORTED_ERROR;
3081        return NULL;
3082    } else {
3083        CharacterIterator *srcCI =(CharacterIterator *)src->context;
3084        srcCI = srcCI->clone();
3085        dest = utext_openCharacterIterator(dest, srcCI, status);
3086        // cast off const on getNativeIndex.
3087        //   For CharacterIterator based UTexts, this is safe, the operation is const.
3088        int64_t  ix = utext_getNativeIndex((UText *)src);
3089        utext_setNativeIndex(dest, ix);
3090        dest->r = srcCI;    // flags that this UText owns the CharacterIterator
3091    }
3092    return dest;
3093}
3094
3095static int32_t U_CALLCONV
3096charIterTextExtract(UText *ut,
3097                  int64_t start, int64_t limit,
3098                  UChar *dest, int32_t destCapacity,
3099                  UErrorCode *status)
3100{
3101    if(U_FAILURE(*status)) {
3102        return 0;
3103    }
3104    if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
3105        *status=U_ILLEGAL_ARGUMENT_ERROR;
3106        return 0;
3107    }
3108    int32_t  length  = (int32_t)ut->a;
3109    int32_t  start32 = pinIndex(start, length);
3110    int32_t  limit32 = pinIndex(limit, length);
3111    int32_t  desti   = 0;
3112    int32_t  srci;
3113    int32_t  copyLimit;
3114
3115    CharacterIterator *ci = (CharacterIterator *)ut->context;
3116    ci->setIndex32(start32);   // Moves ix to lead of surrogate pair, if needed.
3117    srci = ci->getIndex();
3118    copyLimit = srci;
3119    while (srci<limit32) {
3120        UChar32 c = ci->next32PostInc();
3121        int32_t  len = U16_LENGTH(c);
3122        if (desti+len <= destCapacity) {
3123            U16_APPEND_UNSAFE(dest, desti, c);
3124            copyLimit = srci+len;
3125        } else {
3126            desti += len;
3127            *status = U_BUFFER_OVERFLOW_ERROR;
3128        }
3129        srci += len;
3130    }
3131
3132    charIterTextAccess(ut, copyLimit, TRUE);
3133
3134    u_terminateUChars(dest, destCapacity, desti, status);
3135    return desti;
3136}
3137
3138static const struct UTextFuncs charIterFuncs =
3139{
3140    sizeof(UTextFuncs),
3141    0, 0, 0,             // Reserved alignment padding
3142    charIterTextClone,
3143    charIterTextLength,
3144    charIterTextAccess,
3145    charIterTextExtract,
3146    NULL,                // Replace
3147    NULL,                // Copy
3148    NULL,                // MapOffsetToNative,
3149    NULL,                // MapIndexToUTF16,
3150    charIterTextClose,
3151    NULL,                // spare 1
3152    NULL,                // spare 2
3153    NULL                 // spare 3
3154};
3155U_CDECL_END
3156
3157
3158U_CAPI UText * U_EXPORT2
3159utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
3160    if (U_FAILURE(*status)) {
3161        return NULL;
3162    }
3163
3164    if (ci->startIndex() > 0) {
3165        // No support for CharacterIterators that do not start indexing from zero.
3166        *status = U_UNSUPPORTED_ERROR;
3167        return NULL;
3168    }
3169
3170    // Extra space in UText for 2 buffers of CIBufSize UChars each.
3171    int32_t  extraSpace = 2 * CIBufSize * sizeof(UChar);
3172    ut = utext_setup(ut, extraSpace, status);
3173    if (U_SUCCESS(*status)) {
3174        ut->pFuncs                = &charIterFuncs;
3175        ut->context              = ci;
3176        ut->providerProperties   = 0;
3177        ut->a                    = ci->endIndex();        // Length of text
3178        ut->p                    = ut->pExtra;            // First buffer
3179        ut->b                    = -1;                    // Native index of first buffer contents
3180        ut->q                    = (UChar*)ut->pExtra+CIBufSize;  // Second buffer
3181        ut->c                    = -1;                    // Native index of second buffer contents
3182
3183        // Initialize current chunk contents to be empty.
3184        //   First access will fault something in.
3185        //   Note:  The initial nativeStart and chunkOffset must sum to zero
3186        //          so that getNativeIndex() will correctly compute to zero
3187        //          if no call to Access() has ever been made.  They can't be both
3188        //          zero without Access() thinking that the chunk is valid.
3189        ut->chunkContents        = (UChar *)ut->p;
3190        ut->chunkNativeStart     = -1;
3191        ut->chunkOffset          = 1;
3192        ut->chunkNativeLimit     = 0;
3193        ut->chunkLength          = 0;
3194        ut->nativeIndexingLimit  = ut->chunkOffset;  // enables native indexing
3195    }
3196    return ut;
3197}
3198
3199
3200
3201