1/*
2******************************************************************************
3*
4*   Copyright (C) 2007-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  unisetspan.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2007mar01
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/uniset.h"
19#include "unicode/ustring.h"
20#include "unicode/utf8.h"
21#include "unicode/utf16.h"
22#include "cmemory.h"
23#include "uvector.h"
24#include "unisetspan.h"
25
26U_NAMESPACE_BEGIN
27
28/*
29 * List of offsets from the current position from where to try matching
30 * a code point or a string.
31 * Store offsets rather than indexes to simplify the code and use the same list
32 * for both increments (in span()) and decrements (in spanBack()).
33 *
34 * Assumption: The maximum offset is limited, and the offsets that are stored
35 * at any one time are relatively dense, that is, there are normally no gaps of
36 * hundreds or thousands of offset values.
37 *
38 * The implementation uses a circular buffer of byte flags,
39 * each indicating whether the corresponding offset is in the list.
40 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
41 * physically moving part of the list.
42 *
43 * Note: In principle, the caller should setMaxLength() to the maximum of the
44 * max string length and U16_LENGTH/U8_LENGTH to account for
45 * "long" single code points.
46 * However, this implementation uses at least a staticList with more than
47 * U8_LENGTH entries anyway.
48 *
49 * Note: If maxLength were guaranteed to be no more than 32 or 64,
50 * the list could be stored as bit flags in a single integer.
51 * Rather than handling a circular buffer with a start list index,
52 * the integer would simply be shifted when lower offsets are removed.
53 * UnicodeSet does not have a limit on the lengths of strings.
54 */
55class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
56public:
57    OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
58
59    ~OffsetList() {
60        if(list!=staticList) {
61            uprv_free(list);
62        }
63    }
64
65    // Call exactly once if the list is to be used.
66    void setMaxLength(int32_t maxLength) {
67        if(maxLength<=(int32_t)sizeof(staticList)) {
68            capacity=(int32_t)sizeof(staticList);
69        } else {
70            UBool *l=(UBool *)uprv_malloc(maxLength);
71            if(l!=NULL) {
72                list=l;
73                capacity=maxLength;
74            }
75        }
76        uprv_memset(list, 0, capacity);
77    }
78
79    void clear() {
80        uprv_memset(list, 0, capacity);
81        start=length=0;
82    }
83
84    UBool isEmpty() const {
85        return (UBool)(length==0);
86    }
87
88    // Reduce all stored offsets by delta, used when the current position
89    // moves by delta.
90    // There must not be any offsets lower than delta.
91    // If there is an offset equal to delta, it is removed.
92    // delta=[1..maxLength]
93    void shift(int32_t delta) {
94        int32_t i=start+delta;
95        if(i>=capacity) {
96            i-=capacity;
97        }
98        if(list[i]) {
99            list[i]=FALSE;
100            --length;
101        }
102        start=i;
103    }
104
105    // Add an offset. The list must not contain it yet.
106    // offset=[1..maxLength]
107    void addOffset(int32_t offset) {
108        int32_t i=start+offset;
109        if(i>=capacity) {
110            i-=capacity;
111        }
112        list[i]=TRUE;
113        ++length;
114    }
115
116    // offset=[1..maxLength]
117    UBool containsOffset(int32_t offset) const {
118        int32_t i=start+offset;
119        if(i>=capacity) {
120            i-=capacity;
121        }
122        return list[i];
123    }
124
125    // Find the lowest stored offset from a non-empty list, remove it,
126    // and reduce all other offsets by this minimum.
127    // Returns [1..maxLength].
128    int32_t popMinimum() {
129        // Look for the next offset in list[start+1..capacity-1].
130        int32_t i=start, result;
131        while(++i<capacity) {
132            if(list[i]) {
133                list[i]=FALSE;
134                --length;
135                result=i-start;
136                start=i;
137                return result;
138            }
139        }
140        // i==capacity
141
142        // Wrap around and look for the next offset in list[0..start].
143        // Since the list is not empty, there will be one.
144        result=capacity-start;
145        i=0;
146        while(!list[i]) {
147            ++i;
148        }
149        list[i]=FALSE;
150        --length;
151        start=i;
152        return result+=i;
153    }
154
155private:
156    UBool *list;
157    int32_t capacity;
158    int32_t length;
159    int32_t start;
160
161    UBool staticList[16];
162};
163
164// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
165static int32_t
166getUTF8Length(const UChar *s, int32_t length) {
167    UErrorCode errorCode=U_ZERO_ERROR;
168    int32_t length8=0;
169    u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
170    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
171        return length8;
172    } else {
173        // The string contains an unpaired surrogate.
174        // Ignore this string.
175        return 0;
176    }
177}
178
179// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
180static int32_t
181appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
182    UErrorCode errorCode=U_ZERO_ERROR;
183    int32_t length8=0;
184    u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
185    if(U_SUCCESS(errorCode)) {
186        return length8;
187    } else {
188        // The string contains an unpaired surrogate.
189        // Ignore this string.
190        return 0;
191    }
192}
193
194static inline uint8_t
195makeSpanLengthByte(int32_t spanLength) {
196    // 0xfe==UnicodeSetStringSpan::LONG_SPAN
197    return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
198}
199
200// Construct for all variants of span(), or only for any one variant.
201// Initialize as little as possible, for single use.
202UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
203                                           const UVector &setStrings,
204                                           uint32_t which)
205        : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
206          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
207          utf8Length(0),
208          maxLength16(0), maxLength8(0),
209          all((UBool)(which==ALL)) {
210    spanSet.retainAll(set);
211    if(which&NOT_CONTAINED) {
212        // Default to the same sets.
213        // addToSpanNotSet() will create a separate set if necessary.
214        pSpanNotSet=&spanSet;
215    }
216
217    // Determine if the strings even need to be taken into account at all for span() etc.
218    // If any string is relevant, then all strings need to be used for
219    // span(longest match) but only the relevant ones for span(while contained).
220    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
221    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
222    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
223    // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
224    int32_t stringsLength=strings.size();
225
226    int32_t i, spanLength;
227    UBool someRelevant=FALSE;
228    for(i=0; i<stringsLength; ++i) {
229        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
230        const UChar *s16=string.getBuffer();
231        int32_t length16=string.length();
232        UBool thisRelevant;
233        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
234        if(spanLength<length16) {  // Relevant string.
235            someRelevant=thisRelevant=TRUE;
236        } else {
237            thisRelevant=FALSE;
238        }
239        if((which&UTF16) && length16>maxLength16) {
240            maxLength16=length16;
241        }
242        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
243            int32_t length8=getUTF8Length(s16, length16);
244            utf8Length+=length8;
245            if(length8>maxLength8) {
246                maxLength8=length8;
247            }
248        }
249    }
250    if(!someRelevant) {
251        maxLength16=maxLength8=0;
252        return;
253    }
254
255    // Freeze after checking for the need to use strings at all because freezing
256    // a set takes some time and memory which are wasted if there are no relevant strings.
257    if(all) {
258        spanSet.freeze();
259    }
260
261    uint8_t *spanBackLengths;
262    uint8_t *spanUTF8Lengths;
263    uint8_t *spanBackUTF8Lengths;
264
265    // Allocate a block of meta data.
266    int32_t allocSize;
267    if(all) {
268        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
269        allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
270    } else {
271        allocSize=stringsLength;  // One set of span lengths.
272        if(which&UTF8) {
273            // UTF-8 lengths and UTF-8 strings.
274            allocSize+=stringsLength*4+utf8Length;
275        }
276    }
277    if(allocSize<=(int32_t)sizeof(staticLengths)) {
278        utf8Lengths=staticLengths;
279    } else {
280        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
281        if(utf8Lengths==NULL) {
282            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
283            return;  // Out of memory.
284        }
285    }
286
287    if(all) {
288        // Store span lengths for all span() variants.
289        spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
290        spanBackLengths=spanLengths+stringsLength;
291        spanUTF8Lengths=spanBackLengths+stringsLength;
292        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
293        utf8=spanBackUTF8Lengths+stringsLength;
294    } else {
295        // Store span lengths for only one span() variant.
296        if(which&UTF8) {
297            spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
298            utf8=spanLengths+stringsLength;
299        } else {
300            spanLengths=(uint8_t *)utf8Lengths;
301        }
302        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
303    }
304
305    // Set the meta data and pSpanNotSet and write the UTF-8 strings.
306    int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
307
308    for(i=0; i<stringsLength; ++i) {
309        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
310        const UChar *s16=string.getBuffer();
311        int32_t length16=string.length();
312        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
313        if(spanLength<length16) {  // Relevant string.
314            if(which&UTF16) {
315                if(which&CONTAINED) {
316                    if(which&FWD) {
317                        spanLengths[i]=makeSpanLengthByte(spanLength);
318                    }
319                    if(which&BACK) {
320                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
321                        spanBackLengths[i]=makeSpanLengthByte(spanLength);
322                    }
323                } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
324                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
325                }
326            }
327            if(which&UTF8) {
328                uint8_t *s8=utf8+utf8Count;
329                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
330                utf8Count+=utf8Lengths[i]=length8;
331                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
332                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
333                } else {  // Relevant for UTF-8.
334                    if(which&CONTAINED) {
335                        if(which&FWD) {
336                            spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
337                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
338                        }
339                        if(which&BACK) {
340                            spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
341                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
342                        }
343                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
344                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
345                    }
346                }
347            }
348            if(which&NOT_CONTAINED) {
349                // Add string start and end code points to the spanNotSet so that
350                // a span(while not contained) stops before any string.
351                UChar32 c;
352                if(which&FWD) {
353                    int32_t len=0;
354                    U16_NEXT(s16, len, length16, c);
355                    addToSpanNotSet(c);
356                }
357                if(which&BACK) {
358                    int32_t len=length16;
359                    U16_PREV(s16, 0, len, c);
360                    addToSpanNotSet(c);
361                }
362            }
363        } else {  // Irrelevant string.
364            if(which&UTF8) {
365                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
366                    uint8_t *s8=utf8+utf8Count;
367                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
368                    utf8Count+=utf8Lengths[i]=length8;
369                } else {
370                    utf8Lengths[i]=0;
371                }
372            }
373            if(all) {
374                spanLengths[i]=spanBackLengths[i]=
375                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
376                        (uint8_t)ALL_CP_CONTAINED;
377            } else {
378                // All spanXYZLengths pointers contain the same address.
379                spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
380            }
381        }
382    }
383
384    // Finish.
385    if(all) {
386        pSpanNotSet->freeze();
387    }
388}
389
390// Copy constructor. Assumes which==ALL for a frozen set.
391UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
392                                           const UVector &newParentSetStrings)
393        : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
394          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
395          utf8Length(otherStringSpan.utf8Length),
396          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
397          all(TRUE) {
398    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
399        pSpanNotSet=&spanSet;
400    } else {
401        pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
402    }
403
404    // Allocate a block of meta data.
405    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
406    int32_t stringsLength=strings.size();
407    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
408    if(allocSize<=(int32_t)sizeof(staticLengths)) {
409        utf8Lengths=staticLengths;
410    } else {
411        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
412        if(utf8Lengths==NULL) {
413            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
414            return;  // Out of memory.
415        }
416    }
417
418    spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
419    utf8=spanLengths+stringsLength*4;
420    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
421}
422
423UnicodeSetStringSpan::~UnicodeSetStringSpan() {
424    if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
425        delete pSpanNotSet;
426    }
427    if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
428        uprv_free(utf8Lengths);
429    }
430}
431
432void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
433    if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
434        if(spanSet.contains(c)) {
435            return;  // Nothing to do.
436        }
437        UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
438        if(newSet==NULL) {
439            return;  // Out of memory.
440        } else {
441            pSpanNotSet=newSet;
442        }
443    }
444    pSpanNotSet->add(c);
445}
446
447// Compare strings without any argument checks. Requires length>0.
448static inline UBool
449matches16(const UChar *s, const UChar *t, int32_t length) {
450    do {
451        if(*s++!=*t++) {
452            return FALSE;
453        }
454    } while(--length>0);
455    return TRUE;
456}
457
458static inline UBool
459matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
460    do {
461        if(*s++!=*t++) {
462            return FALSE;
463        }
464    } while(--length>0);
465    return TRUE;
466}
467
468// Compare 16-bit Unicode strings (which may be malformed UTF-16)
469// at code point boundaries.
470// That is, each edge of a match must not be in the middle of a surrogate pair.
471static inline UBool
472matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
473    s+=start;
474    limit-=start;
475    return matches16(s, t, length) &&
476           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
477           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
478}
479
480// Does the set contain the next code point?
481// If so, return its length; otherwise return its negative length.
482static inline int32_t
483spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
484    UChar c=*s, c2;
485    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
486        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
487    }
488    return set.contains(c) ? 1 : -1;
489}
490
491static inline int32_t
492spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
493    UChar c=s[length-1], c2;
494    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
495        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
496    }
497    return set.contains(c) ? 1 : -1;
498}
499
500static inline int32_t
501spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
502    UChar32 c=*s;
503    if((int8_t)c>=0) {
504        return set.contains(c) ? 1 : -1;
505    }
506    // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
507    int32_t i=0;
508    U8_NEXT_OR_FFFD(s, i, length, c);
509    return set.contains(c) ? i : -i;
510}
511
512static inline int32_t
513spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
514    UChar32 c=s[length-1];
515    if((int8_t)c>=0) {
516        return set.contains(c) ? 1 : -1;
517    }
518    int32_t i=length-1;
519    c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
520    length-=i;
521    return set.contains(c) ? length : -length;
522}
523
524/*
525 * Note: In span() when spanLength==0 (after a string match, or at the beginning
526 * after an empty code point span) and in spanNot() and spanNotUTF8(),
527 * string matching could use a binary search
528 * because all string matches are done from the same start index.
529 *
530 * For UTF-8, this would require a comparison function that returns UTF-16 order.
531 *
532 * This optimization should not be necessary for normal UnicodeSets because
533 * most sets have no strings, and most sets with strings have
534 * very few very short strings.
535 * For cases with many strings, it might be better to use a different API
536 * and implementation with a DFA (state machine).
537 */
538
539/*
540 * Algorithm for span(USET_SPAN_CONTAINED)
541 *
542 * Theoretical algorithm:
543 * - Iterate through the string, and at each code point boundary:
544 *   + If the code point there is in the set, then remember to continue after it.
545 *   + If a set string matches at the current position, then remember to continue after it.
546 *   + Either recursively span for each code point or string match,
547 *     or recursively span for all but the shortest one and
548 *     iteratively continue the span with the shortest local match.
549 *   + Remember the longest recursive span (the farthest end point).
550 *   + If there is no match at the current position, neither for the code point there
551 *     nor for any set string, then stop and return the longest recursive span length.
552 *
553 * Optimized implementation:
554 *
555 * (We assume that most sets will have very few very short strings.
556 * A span using a string-less set is extremely fast.)
557 *
558 * Create and cache a spanSet which contains all of the single code points
559 * of the original set but none of its strings.
560 *
561 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
562 * - Loop:
563 *   + Try to match each set string at the end of the spanLength.
564 *     ~ Set strings that start with set-contained code points must be matched
565 *       with a partial overlap because the recursive algorithm would have tried
566 *       to match them at every position.
567 *     ~ Set strings that entirely consist of set-contained code points
568 *       are irrelevant for span(USET_SPAN_CONTAINED) because the
569 *       recursive algorithm would continue after them anyway
570 *       and find the longest recursive match from their end.
571 *     ~ Rather than recursing, note each end point of a set string match.
572 *   + If no set string matched after spanSet.span(), then return
573 *     with where the spanSet.span() ended.
574 *   + If at least one set string matched after spanSet.span(), then
575 *     pop the shortest string match end point and continue
576 *     the loop, trying to match all set strings from there.
577 *   + If at least one more set string matched after a previous string match,
578 *     then test if the code point after the previous string match is also
579 *     contained in the set.
580 *     Continue the loop with the shortest end point of either this code point
581 *     or a matching set string.
582 *   + If no more set string matched after a previous string match,
583 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
584 *     Stop if spanLength==0, otherwise continue the loop.
585 *
586 * By noting each end point of a set string match,
587 * the function visits each string position at most once and finishes
588 * in linear time.
589 *
590 * The recursive algorithm may visit the same string position many times
591 * if multiple paths lead to it and finishes in exponential time.
592 */
593
594/*
595 * Algorithm for span(USET_SPAN_SIMPLE)
596 *
597 * Theoretical algorithm:
598 * - Iterate through the string, and at each code point boundary:
599 *   + If the code point there is in the set, then remember to continue after it.
600 *   + If a set string matches at the current position, then remember to continue after it.
601 *   + Continue from the farthest match position and ignore all others.
602 *   + If there is no match at the current position,
603 *     then stop and return the current position.
604 *
605 * Optimized implementation:
606 *
607 * (Same assumption and spanSet as above.)
608 *
609 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
610 * - Loop:
611 *   + Try to match each set string at the end of the spanLength.
612 *     ~ Set strings that start with set-contained code points must be matched
613 *       with a partial overlap because the standard algorithm would have tried
614 *       to match them earlier.
615 *     ~ Set strings that entirely consist of set-contained code points
616 *       must be matched with a full overlap because the longest-match algorithm
617 *       would hide set string matches that end earlier.
618 *       Such set strings need not be matched earlier inside the code point span
619 *       because the standard algorithm would then have continued after
620 *       the set string match anyway.
621 *     ~ Remember the longest set string match (farthest end point) from the earliest
622 *       starting point.
623 *   + If no set string matched after spanSet.span(), then return
624 *     with where the spanSet.span() ended.
625 *   + If at least one set string matched, then continue the loop after the
626 *     longest match from the earliest position.
627 *   + If no more set string matched after a previous string match,
628 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
629 *     Stop if spanLength==0, otherwise continue the loop.
630 */
631
632int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
633    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
634        return spanNot(s, length);
635    }
636    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
637    if(spanLength==length) {
638        return length;
639    }
640
641    // Consider strings; they may overlap with the span.
642    OffsetList offsets;
643    if(spanCondition==USET_SPAN_CONTAINED) {
644        // Use offset list to try all possibilities.
645        offsets.setMaxLength(maxLength16);
646    }
647    int32_t pos=spanLength, rest=length-pos;
648    int32_t i, stringsLength=strings.size();
649    for(;;) {
650        if(spanCondition==USET_SPAN_CONTAINED) {
651            for(i=0; i<stringsLength; ++i) {
652                int32_t overlap=spanLengths[i];
653                if(overlap==ALL_CP_CONTAINED) {
654                    continue;  // Irrelevant string.
655                }
656                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
657                const UChar *s16=string.getBuffer();
658                int32_t length16=string.length();
659
660                // Try to match this string at pos-overlap..pos.
661                if(overlap>=LONG_SPAN) {
662                    overlap=length16;
663                    // While contained: No point matching fully inside the code point span.
664                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
665                }
666                if(overlap>spanLength) {
667                    overlap=spanLength;
668                }
669                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
670                for(;;) {
671                    if(inc>rest) {
672                        break;
673                    }
674                    // Try to match if the increment is not listed already.
675                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
676                        if(inc==rest) {
677                            return length;  // Reached the end of the string.
678                        }
679                        offsets.addOffset(inc);
680                    }
681                    if(overlap==0) {
682                        break;
683                    }
684                    --overlap;
685                    ++inc;
686                }
687            }
688        } else /* USET_SPAN_SIMPLE */ {
689            int32_t maxInc=0, maxOverlap=0;
690            for(i=0; i<stringsLength; ++i) {
691                int32_t overlap=spanLengths[i];
692                // For longest match, we do need to try to match even an all-contained string
693                // to find the match from the earliest start.
694
695                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
696                const UChar *s16=string.getBuffer();
697                int32_t length16=string.length();
698
699                // Try to match this string at pos-overlap..pos.
700                if(overlap>=LONG_SPAN) {
701                    overlap=length16;
702                    // Longest match: Need to match fully inside the code point span
703                    // to find the match from the earliest start.
704                }
705                if(overlap>spanLength) {
706                    overlap=spanLength;
707                }
708                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
709                for(;;) {
710                    if(inc>rest || overlap<maxOverlap) {
711                        break;
712                    }
713                    // Try to match if the string is longer or starts earlier.
714                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
715                        matches16CPB(s, pos-overlap, length, s16, length16)
716                    ) {
717                        maxInc=inc;  // Longest match from earliest start.
718                        maxOverlap=overlap;
719                        break;
720                    }
721                    --overlap;
722                    ++inc;
723                }
724            }
725
726            if(maxInc!=0 || maxOverlap!=0) {
727                // Longest-match algorithm, and there was a string match.
728                // Simply continue after it.
729                pos+=maxInc;
730                rest-=maxInc;
731                if(rest==0) {
732                    return length;  // Reached the end of the string.
733                }
734                spanLength=0;  // Match strings from after a string match.
735                continue;
736            }
737        }
738        // Finished trying to match all strings at pos.
739
740        if(spanLength!=0 || pos==0) {
741            // The position is after an unlimited code point span (spanLength!=0),
742            // not after a string match.
743            // The only position where spanLength==0 after a span is pos==0.
744            // Otherwise, an unlimited code point span is only tried again when no
745            // strings match, and if such a non-initial span fails we stop.
746            if(offsets.isEmpty()) {
747                return pos;  // No strings matched after a span.
748            }
749            // Match strings from after the next string match.
750        } else {
751            // The position is after a string match (or a single code point).
752            if(offsets.isEmpty()) {
753                // No more strings matched after a previous string match.
754                // Try another code point span from after the last string match.
755                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
756                if( spanLength==rest || // Reached the end of the string, or
757                    spanLength==0       // neither strings nor span progressed.
758                ) {
759                    return pos+spanLength;
760                }
761                pos+=spanLength;
762                rest-=spanLength;
763                continue;  // spanLength>0: Match strings from after a span.
764            } else {
765                // Try to match only one code point from after a string match if some
766                // string matched beyond it, so that we try all possible positions
767                // and don't overshoot.
768                spanLength=spanOne(spanSet, s+pos, rest);
769                if(spanLength>0) {
770                    if(spanLength==rest) {
771                        return length;  // Reached the end of the string.
772                    }
773                    // Match strings after this code point.
774                    // There cannot be any increments below it because UnicodeSet strings
775                    // contain multiple code points.
776                    pos+=spanLength;
777                    rest-=spanLength;
778                    offsets.shift(spanLength);
779                    spanLength=0;
780                    continue;  // Match strings from after a single code point.
781                }
782                // Match strings from after the next string match.
783            }
784        }
785        int32_t minOffset=offsets.popMinimum();
786        pos+=minOffset;
787        rest-=minOffset;
788        spanLength=0;  // Match strings from after a string match.
789    }
790}
791
792int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
793    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
794        return spanNotBack(s, length);
795    }
796    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
797    if(pos==0) {
798        return 0;
799    }
800    int32_t spanLength=length-pos;
801
802    // Consider strings; they may overlap with the span.
803    OffsetList offsets;
804    if(spanCondition==USET_SPAN_CONTAINED) {
805        // Use offset list to try all possibilities.
806        offsets.setMaxLength(maxLength16);
807    }
808    int32_t i, stringsLength=strings.size();
809    uint8_t *spanBackLengths=spanLengths;
810    if(all) {
811        spanBackLengths+=stringsLength;
812    }
813    for(;;) {
814        if(spanCondition==USET_SPAN_CONTAINED) {
815            for(i=0; i<stringsLength; ++i) {
816                int32_t overlap=spanBackLengths[i];
817                if(overlap==ALL_CP_CONTAINED) {
818                    continue;  // Irrelevant string.
819                }
820                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
821                const UChar *s16=string.getBuffer();
822                int32_t length16=string.length();
823
824                // Try to match this string at pos-(length16-overlap)..pos-length16.
825                if(overlap>=LONG_SPAN) {
826                    overlap=length16;
827                    // While contained: No point matching fully inside the code point span.
828                    int32_t len1=0;
829                    U16_FWD_1(s16, len1, overlap);
830                    overlap-=len1;  // Length of the string minus the first code point.
831                }
832                if(overlap>spanLength) {
833                    overlap=spanLength;
834                }
835                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
836                for(;;) {
837                    if(dec>pos) {
838                        break;
839                    }
840                    // Try to match if the decrement is not listed already.
841                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
842                        if(dec==pos) {
843                            return 0;  // Reached the start of the string.
844                        }
845                        offsets.addOffset(dec);
846                    }
847                    if(overlap==0) {
848                        break;
849                    }
850                    --overlap;
851                    ++dec;
852                }
853            }
854        } else /* USET_SPAN_SIMPLE */ {
855            int32_t maxDec=0, maxOverlap=0;
856            for(i=0; i<stringsLength; ++i) {
857                int32_t overlap=spanBackLengths[i];
858                // For longest match, we do need to try to match even an all-contained string
859                // to find the match from the latest end.
860
861                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
862                const UChar *s16=string.getBuffer();
863                int32_t length16=string.length();
864
865                // Try to match this string at pos-(length16-overlap)..pos-length16.
866                if(overlap>=LONG_SPAN) {
867                    overlap=length16;
868                    // Longest match: Need to match fully inside the code point span
869                    // to find the match from the latest end.
870                }
871                if(overlap>spanLength) {
872                    overlap=spanLength;
873                }
874                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
875                for(;;) {
876                    if(dec>pos || overlap<maxOverlap) {
877                        break;
878                    }
879                    // Try to match if the string is longer or ends later.
880                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
881                        matches16CPB(s, pos-dec, length, s16, length16)
882                    ) {
883                        maxDec=dec;  // Longest match from latest end.
884                        maxOverlap=overlap;
885                        break;
886                    }
887                    --overlap;
888                    ++dec;
889                }
890            }
891
892            if(maxDec!=0 || maxOverlap!=0) {
893                // Longest-match algorithm, and there was a string match.
894                // Simply continue before it.
895                pos-=maxDec;
896                if(pos==0) {
897                    return 0;  // Reached the start of the string.
898                }
899                spanLength=0;  // Match strings from before a string match.
900                continue;
901            }
902        }
903        // Finished trying to match all strings at pos.
904
905        if(spanLength!=0 || pos==length) {
906            // The position is before an unlimited code point span (spanLength!=0),
907            // not before a string match.
908            // The only position where spanLength==0 before a span is pos==length.
909            // Otherwise, an unlimited code point span is only tried again when no
910            // strings match, and if such a non-initial span fails we stop.
911            if(offsets.isEmpty()) {
912                return pos;  // No strings matched before a span.
913            }
914            // Match strings from before the next string match.
915        } else {
916            // The position is before a string match (or a single code point).
917            if(offsets.isEmpty()) {
918                // No more strings matched before a previous string match.
919                // Try another code point span from before the last string match.
920                int32_t oldPos=pos;
921                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
922                spanLength=oldPos-pos;
923                if( pos==0 ||           // Reached the start of the string, or
924                    spanLength==0       // neither strings nor span progressed.
925                ) {
926                    return pos;
927                }
928                continue;  // spanLength>0: Match strings from before a span.
929            } else {
930                // Try to match only one code point from before a string match if some
931                // string matched beyond it, so that we try all possible positions
932                // and don't overshoot.
933                spanLength=spanOneBack(spanSet, s, pos);
934                if(spanLength>0) {
935                    if(spanLength==pos) {
936                        return 0;  // Reached the start of the string.
937                    }
938                    // Match strings before this code point.
939                    // There cannot be any decrements below it because UnicodeSet strings
940                    // contain multiple code points.
941                    pos-=spanLength;
942                    offsets.shift(spanLength);
943                    spanLength=0;
944                    continue;  // Match strings from before a single code point.
945                }
946                // Match strings from before the next string match.
947            }
948        }
949        pos-=offsets.popMinimum();
950        spanLength=0;  // Match strings from before a string match.
951    }
952}
953
954int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
955    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
956        return spanNotUTF8(s, length);
957    }
958    int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
959    if(spanLength==length) {
960        return length;
961    }
962
963    // Consider strings; they may overlap with the span.
964    OffsetList offsets;
965    if(spanCondition==USET_SPAN_CONTAINED) {
966        // Use offset list to try all possibilities.
967        offsets.setMaxLength(maxLength8);
968    }
969    int32_t pos=spanLength, rest=length-pos;
970    int32_t i, stringsLength=strings.size();
971    uint8_t *spanUTF8Lengths=spanLengths;
972    if(all) {
973        spanUTF8Lengths+=2*stringsLength;
974    }
975    for(;;) {
976        const uint8_t *s8=utf8;
977        int32_t length8;
978        if(spanCondition==USET_SPAN_CONTAINED) {
979            for(i=0; i<stringsLength; ++i) {
980                length8=utf8Lengths[i];
981                if(length8==0) {
982                    continue;  // String not representable in UTF-8.
983                }
984                int32_t overlap=spanUTF8Lengths[i];
985                if(overlap==ALL_CP_CONTAINED) {
986                    s8+=length8;
987                    continue;  // Irrelevant string.
988                }
989
990                // Try to match this string at pos-overlap..pos.
991                if(overlap>=LONG_SPAN) {
992                    overlap=length8;
993                    // While contained: No point matching fully inside the code point span.
994                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
995                }
996                if(overlap>spanLength) {
997                    overlap=spanLength;
998                }
999                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1000                for(;;) {
1001                    if(inc>rest) {
1002                        break;
1003                    }
1004                    // Try to match if the increment is not listed already.
1005                    // Match at code point boundaries. (The UTF-8 strings were converted
1006                    // from UTF-16 and are guaranteed to be well-formed.)
1007                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1008                        !offsets.containsOffset(inc) &&
1009                        matches8(s+pos-overlap, s8, length8)
1010
1011                    ) {
1012                        if(inc==rest) {
1013                            return length;  // Reached the end of the string.
1014                        }
1015                        offsets.addOffset(inc);
1016                    }
1017                    if(overlap==0) {
1018                        break;
1019                    }
1020                    --overlap;
1021                    ++inc;
1022                }
1023                s8+=length8;
1024            }
1025        } else /* USET_SPAN_SIMPLE */ {
1026            int32_t maxInc=0, maxOverlap=0;
1027            for(i=0; i<stringsLength; ++i) {
1028                length8=utf8Lengths[i];
1029                if(length8==0) {
1030                    continue;  // String not representable in UTF-8.
1031                }
1032                int32_t overlap=spanUTF8Lengths[i];
1033                // For longest match, we do need to try to match even an all-contained string
1034                // to find the match from the earliest start.
1035
1036                // Try to match this string at pos-overlap..pos.
1037                if(overlap>=LONG_SPAN) {
1038                    overlap=length8;
1039                    // Longest match: Need to match fully inside the code point span
1040                    // to find the match from the earliest start.
1041                }
1042                if(overlap>spanLength) {
1043                    overlap=spanLength;
1044                }
1045                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1046                for(;;) {
1047                    if(inc>rest || overlap<maxOverlap) {
1048                        break;
1049                    }
1050                    // Try to match if the string is longer or starts earlier.
1051                    // Match at code point boundaries. (The UTF-8 strings were converted
1052                    // from UTF-16 and are guaranteed to be well-formed.)
1053                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1054                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1055                        matches8(s+pos-overlap, s8, length8)
1056
1057                    ) {
1058                        maxInc=inc;  // Longest match from earliest start.
1059                        maxOverlap=overlap;
1060                        break;
1061                    }
1062                    --overlap;
1063                    ++inc;
1064                }
1065                s8+=length8;
1066            }
1067
1068            if(maxInc!=0 || maxOverlap!=0) {
1069                // Longest-match algorithm, and there was a string match.
1070                // Simply continue after it.
1071                pos+=maxInc;
1072                rest-=maxInc;
1073                if(rest==0) {
1074                    return length;  // Reached the end of the string.
1075                }
1076                spanLength=0;  // Match strings from after a string match.
1077                continue;
1078            }
1079        }
1080        // Finished trying to match all strings at pos.
1081
1082        if(spanLength!=0 || pos==0) {
1083            // The position is after an unlimited code point span (spanLength!=0),
1084            // not after a string match.
1085            // The only position where spanLength==0 after a span is pos==0.
1086            // Otherwise, an unlimited code point span is only tried again when no
1087            // strings match, and if such a non-initial span fails we stop.
1088            if(offsets.isEmpty()) {
1089                return pos;  // No strings matched after a span.
1090            }
1091            // Match strings from after the next string match.
1092        } else {
1093            // The position is after a string match (or a single code point).
1094            if(offsets.isEmpty()) {
1095                // No more strings matched after a previous string match.
1096                // Try another code point span from after the last string match.
1097                spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1098                if( spanLength==rest || // Reached the end of the string, or
1099                    spanLength==0       // neither strings nor span progressed.
1100                ) {
1101                    return pos+spanLength;
1102                }
1103                pos+=spanLength;
1104                rest-=spanLength;
1105                continue;  // spanLength>0: Match strings from after a span.
1106            } else {
1107                // Try to match only one code point from after a string match if some
1108                // string matched beyond it, so that we try all possible positions
1109                // and don't overshoot.
1110                spanLength=spanOneUTF8(spanSet, s+pos, rest);
1111                if(spanLength>0) {
1112                    if(spanLength==rest) {
1113                        return length;  // Reached the end of the string.
1114                    }
1115                    // Match strings after this code point.
1116                    // There cannot be any increments below it because UnicodeSet strings
1117                    // contain multiple code points.
1118                    pos+=spanLength;
1119                    rest-=spanLength;
1120                    offsets.shift(spanLength);
1121                    spanLength=0;
1122                    continue;  // Match strings from after a single code point.
1123                }
1124                // Match strings from after the next string match.
1125            }
1126        }
1127        int32_t minOffset=offsets.popMinimum();
1128        pos+=minOffset;
1129        rest-=minOffset;
1130        spanLength=0;  // Match strings from after a string match.
1131    }
1132}
1133
1134int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1135    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1136        return spanNotBackUTF8(s, length);
1137    }
1138    int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1139    if(pos==0) {
1140        return 0;
1141    }
1142    int32_t spanLength=length-pos;
1143
1144    // Consider strings; they may overlap with the span.
1145    OffsetList offsets;
1146    if(spanCondition==USET_SPAN_CONTAINED) {
1147        // Use offset list to try all possibilities.
1148        offsets.setMaxLength(maxLength8);
1149    }
1150    int32_t i, stringsLength=strings.size();
1151    uint8_t *spanBackUTF8Lengths=spanLengths;
1152    if(all) {
1153        spanBackUTF8Lengths+=3*stringsLength;
1154    }
1155    for(;;) {
1156        const uint8_t *s8=utf8;
1157        int32_t length8;
1158        if(spanCondition==USET_SPAN_CONTAINED) {
1159            for(i=0; i<stringsLength; ++i) {
1160                length8=utf8Lengths[i];
1161                if(length8==0) {
1162                    continue;  // String not representable in UTF-8.
1163                }
1164                int32_t overlap=spanBackUTF8Lengths[i];
1165                if(overlap==ALL_CP_CONTAINED) {
1166                    s8+=length8;
1167                    continue;  // Irrelevant string.
1168                }
1169
1170                // Try to match this string at pos-(length8-overlap)..pos-length8.
1171                if(overlap>=LONG_SPAN) {
1172                    overlap=length8;
1173                    // While contained: No point matching fully inside the code point span.
1174                    int32_t len1=0;
1175                    U8_FWD_1(s8, len1, overlap);
1176                    overlap-=len1;  // Length of the string minus the first code point.
1177                }
1178                if(overlap>spanLength) {
1179                    overlap=spanLength;
1180                }
1181                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1182                for(;;) {
1183                    if(dec>pos) {
1184                        break;
1185                    }
1186                    // Try to match if the decrement is not listed already.
1187                    // Match at code point boundaries. (The UTF-8 strings were converted
1188                    // from UTF-16 and are guaranteed to be well-formed.)
1189                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1190                        !offsets.containsOffset(dec) &&
1191                        matches8(s+pos-dec, s8, length8)
1192                    ) {
1193                        if(dec==pos) {
1194                            return 0;  // Reached the start of the string.
1195                        }
1196                        offsets.addOffset(dec);
1197                    }
1198                    if(overlap==0) {
1199                        break;
1200                    }
1201                    --overlap;
1202                    ++dec;
1203                }
1204                s8+=length8;
1205            }
1206        } else /* USET_SPAN_SIMPLE */ {
1207            int32_t maxDec=0, maxOverlap=0;
1208            for(i=0; i<stringsLength; ++i) {
1209                length8=utf8Lengths[i];
1210                if(length8==0) {
1211                    continue;  // String not representable in UTF-8.
1212                }
1213                int32_t overlap=spanBackUTF8Lengths[i];
1214                // For longest match, we do need to try to match even an all-contained string
1215                // to find the match from the latest end.
1216
1217                // Try to match this string at pos-(length8-overlap)..pos-length8.
1218                if(overlap>=LONG_SPAN) {
1219                    overlap=length8;
1220                    // Longest match: Need to match fully inside the code point span
1221                    // to find the match from the latest end.
1222                }
1223                if(overlap>spanLength) {
1224                    overlap=spanLength;
1225                }
1226                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1227                for(;;) {
1228                    if(dec>pos || overlap<maxOverlap) {
1229                        break;
1230                    }
1231                    // Try to match if the string is longer or ends later.
1232                    // Match at code point boundaries. (The UTF-8 strings were converted
1233                    // from UTF-16 and are guaranteed to be well-formed.)
1234                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1235                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1236                        matches8(s+pos-dec, s8, length8)
1237                    ) {
1238                        maxDec=dec;  // Longest match from latest end.
1239                        maxOverlap=overlap;
1240                        break;
1241                    }
1242                    --overlap;
1243                    ++dec;
1244                }
1245                s8+=length8;
1246            }
1247
1248            if(maxDec!=0 || maxOverlap!=0) {
1249                // Longest-match algorithm, and there was a string match.
1250                // Simply continue before it.
1251                pos-=maxDec;
1252                if(pos==0) {
1253                    return 0;  // Reached the start of the string.
1254                }
1255                spanLength=0;  // Match strings from before a string match.
1256                continue;
1257            }
1258        }
1259        // Finished trying to match all strings at pos.
1260
1261        if(spanLength!=0 || pos==length) {
1262            // The position is before an unlimited code point span (spanLength!=0),
1263            // not before a string match.
1264            // The only position where spanLength==0 before a span is pos==length.
1265            // Otherwise, an unlimited code point span is only tried again when no
1266            // strings match, and if such a non-initial span fails we stop.
1267            if(offsets.isEmpty()) {
1268                return pos;  // No strings matched before a span.
1269            }
1270            // Match strings from before the next string match.
1271        } else {
1272            // The position is before a string match (or a single code point).
1273            if(offsets.isEmpty()) {
1274                // No more strings matched before a previous string match.
1275                // Try another code point span from before the last string match.
1276                int32_t oldPos=pos;
1277                pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1278                spanLength=oldPos-pos;
1279                if( pos==0 ||           // Reached the start of the string, or
1280                    spanLength==0       // neither strings nor span progressed.
1281                ) {
1282                    return pos;
1283                }
1284                continue;  // spanLength>0: Match strings from before a span.
1285            } else {
1286                // Try to match only one code point from before a string match if some
1287                // string matched beyond it, so that we try all possible positions
1288                // and don't overshoot.
1289                spanLength=spanOneBackUTF8(spanSet, s, pos);
1290                if(spanLength>0) {
1291                    if(spanLength==pos) {
1292                        return 0;  // Reached the start of the string.
1293                    }
1294                    // Match strings before this code point.
1295                    // There cannot be any decrements below it because UnicodeSet strings
1296                    // contain multiple code points.
1297                    pos-=spanLength;
1298                    offsets.shift(spanLength);
1299                    spanLength=0;
1300                    continue;  // Match strings from before a single code point.
1301                }
1302                // Match strings from before the next string match.
1303            }
1304        }
1305        pos-=offsets.popMinimum();
1306        spanLength=0;  // Match strings from before a string match.
1307    }
1308}
1309
1310/*
1311 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1312 *
1313 * Theoretical algorithm:
1314 * - Iterate through the string, and at each code point boundary:
1315 *   + If the code point there is in the set, then return with the current position.
1316 *   + If a set string matches at the current position, then return with the current position.
1317 *
1318 * Optimized implementation:
1319 *
1320 * (Same assumption as for span() above.)
1321 *
1322 * Create and cache a spanNotSet which contains all of the single code points
1323 * of the original set but none of its strings.
1324 * For each set string add its initial code point to the spanNotSet.
1325 * (Also add its final code point for spanNotBack().)
1326 *
1327 * - Loop:
1328 *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1329 *   + If the current code point is in the original set, then
1330 *     return the current position.
1331 *   + If any set string matches at the current position, then
1332 *     return the current position.
1333 *   + If there is no match at the current position, neither for the code point there
1334 *     nor for any set string, then skip this code point and continue the loop.
1335 *     This happens for set-string-initial code points that were added to spanNotSet
1336 *     when there is not actually a match for such a set string.
1337 */
1338
1339int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1340    int32_t pos=0, rest=length;
1341    int32_t i, stringsLength=strings.size();
1342    do {
1343        // Span until we find a code point from the set,
1344        // or a code point that starts or ends some string.
1345        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1346        if(i==rest) {
1347            return length;  // Reached the end of the string.
1348        }
1349        pos+=i;
1350        rest-=i;
1351
1352        // Check whether the current code point is in the original set,
1353        // without the string starts and ends.
1354        int32_t cpLength=spanOne(spanSet, s+pos, rest);
1355        if(cpLength>0) {
1356            return pos;  // There is a set element at pos.
1357        }
1358
1359        // Try to match the strings at pos.
1360        for(i=0; i<stringsLength; ++i) {
1361            if(spanLengths[i]==ALL_CP_CONTAINED) {
1362                continue;  // Irrelevant string.
1363            }
1364            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1365            const UChar *s16=string.getBuffer();
1366            int32_t length16=string.length();
1367            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1368                return pos;  // There is a set element at pos.
1369            }
1370        }
1371
1372        // The span(while not contained) ended on a string start/end which is
1373        // not in the original set. Skip this code point and continue.
1374        // cpLength<0
1375        pos-=cpLength;
1376        rest+=cpLength;
1377    } while(rest!=0);
1378    return length;  // Reached the end of the string.
1379}
1380
1381int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1382    int32_t pos=length;
1383    int32_t i, stringsLength=strings.size();
1384    do {
1385        // Span until we find a code point from the set,
1386        // or a code point that starts or ends some string.
1387        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1388        if(pos==0) {
1389            return 0;  // Reached the start of the string.
1390        }
1391
1392        // Check whether the current code point is in the original set,
1393        // without the string starts and ends.
1394        int32_t cpLength=spanOneBack(spanSet, s, pos);
1395        if(cpLength>0) {
1396            return pos;  // There is a set element at pos.
1397        }
1398
1399        // Try to match the strings at pos.
1400        for(i=0; i<stringsLength; ++i) {
1401            // Use spanLengths rather than a spanBackLengths pointer because
1402            // it is easier and we only need to know whether the string is irrelevant
1403            // which is the same in either array.
1404            if(spanLengths[i]==ALL_CP_CONTAINED) {
1405                continue;  // Irrelevant string.
1406            }
1407            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1408            const UChar *s16=string.getBuffer();
1409            int32_t length16=string.length();
1410            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1411                return pos;  // There is a set element at pos.
1412            }
1413        }
1414
1415        // The span(while not contained) ended on a string start/end which is
1416        // not in the original set. Skip this code point and continue.
1417        // cpLength<0
1418        pos+=cpLength;
1419    } while(pos!=0);
1420    return 0;  // Reached the start of the string.
1421}
1422
1423int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1424    int32_t pos=0, rest=length;
1425    int32_t i, stringsLength=strings.size();
1426    uint8_t *spanUTF8Lengths=spanLengths;
1427    if(all) {
1428        spanUTF8Lengths+=2*stringsLength;
1429    }
1430    do {
1431        // Span until we find a code point from the set,
1432        // or a code point that starts or ends some string.
1433        i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1434        if(i==rest) {
1435            return length;  // Reached the end of the string.
1436        }
1437        pos+=i;
1438        rest-=i;
1439
1440        // Check whether the current code point is in the original set,
1441        // without the string starts and ends.
1442        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1443        if(cpLength>0) {
1444            return pos;  // There is a set element at pos.
1445        }
1446
1447        // Try to match the strings at pos.
1448        const uint8_t *s8=utf8;
1449        int32_t length8;
1450        for(i=0; i<stringsLength; ++i) {
1451            length8=utf8Lengths[i];
1452            // ALL_CP_CONTAINED: Irrelevant string.
1453            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1454                return pos;  // There is a set element at pos.
1455            }
1456            s8+=length8;
1457        }
1458
1459        // The span(while not contained) ended on a string start/end which is
1460        // not in the original set. Skip this code point and continue.
1461        // cpLength<0
1462        pos-=cpLength;
1463        rest+=cpLength;
1464    } while(rest!=0);
1465    return length;  // Reached the end of the string.
1466}
1467
1468int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1469    int32_t pos=length;
1470    int32_t i, stringsLength=strings.size();
1471    uint8_t *spanBackUTF8Lengths=spanLengths;
1472    if(all) {
1473        spanBackUTF8Lengths+=3*stringsLength;
1474    }
1475    do {
1476        // Span until we find a code point from the set,
1477        // or a code point that starts or ends some string.
1478        pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1479        if(pos==0) {
1480            return 0;  // Reached the start of the string.
1481        }
1482
1483        // Check whether the current code point is in the original set,
1484        // without the string starts and ends.
1485        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1486        if(cpLength>0) {
1487            return pos;  // There is a set element at pos.
1488        }
1489
1490        // Try to match the strings at pos.
1491        const uint8_t *s8=utf8;
1492        int32_t length8;
1493        for(i=0; i<stringsLength; ++i) {
1494            length8=utf8Lengths[i];
1495            // ALL_CP_CONTAINED: Irrelevant string.
1496            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1497                return pos;  // There is a set element at pos.
1498            }
1499            s8+=length8;
1500        }
1501
1502        // The span(while not contained) ended on a string start/end which is
1503        // not in the original set. Skip this code point and continue.
1504        // cpLength<0
1505        pos+=cpLength;
1506    } while(pos!=0);
1507    return 0;  // Reached the start of the string.
1508}
1509
1510U_NAMESPACE_END
1511