1/*
2******************************************************************************
3*
4*   Copyright (C) 2007, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  unisetspan.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2007mar01
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/uniset.h"
19#include "unicode/ustring.h"
20#include "cmemory.h"
21#include "uvector.h"
22#include "unisetspan.h"
23
24U_NAMESPACE_BEGIN
25
26/*
27 * List of offsets from the current position from where to try matching
28 * a code point or a string.
29 * Store offsets rather than indexes to simplify the code and use the same list
30 * for both increments (in span()) and decrements (in spanBack()).
31 *
32 * Assumption: The maximum offset is limited, and the offsets that are stored
33 * at any one time are relatively dense, that is, there are normally no gaps of
34 * hundreds or thousands of offset values.
35 *
36 * The implementation uses a circular buffer of byte flags,
37 * each indicating whether the corresponding offset is in the list.
38 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
39 * physically moving part of the list.
40 *
41 * Note: In principle, the caller should setMaxLength() to the maximum of the
42 * max string length and U16_LENGTH/U8_LENGTH to account for
43 * "long" single code points.
44 * However, this implementation uses at least a staticList with more than
45 * U8_LENGTH entries anyway.
46 *
47 * Note: If maxLength were guaranteed to be no more than 32 or 64,
48 * the list could be stored as bit flags in a single integer.
49 * Rather than handling a circular buffer with a start list index,
50 * the integer would simply be shifted when lower offsets are removed.
51 * UnicodeSet does not have a limit on the lengths of strings.
52 */
53class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
54public:
55    OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
56
57    ~OffsetList() {
58        if(list!=staticList) {
59            uprv_free(list);
60        }
61    }
62
63    // Call exactly once if the list is to be used.
64    void setMaxLength(int32_t maxLength) {
65        if(maxLength<=(int32_t)sizeof(staticList)) {
66            capacity=(int32_t)sizeof(staticList);
67        } else {
68            UBool *l=(UBool *)uprv_malloc(maxLength);
69            if(l!=NULL) {
70                list=l;
71                capacity=maxLength;
72            }
73        }
74        uprv_memset(list, 0, capacity);
75    }
76
77    void clear() {
78        uprv_memset(list, 0, capacity);
79        start=length=0;
80    }
81
82    UBool isEmpty() const {
83        return (UBool)(length==0);
84    }
85
86    // Reduce all stored offsets by delta, used when the current position
87    // moves by delta.
88    // There must not be any offsets lower than delta.
89    // If there is an offset equal to delta, it is removed.
90    // delta=[1..maxLength]
91    void shift(int32_t delta) {
92        int32_t i=start+delta;
93        if(i>=capacity) {
94            i-=capacity;
95        }
96        if(list[i]) {
97            list[i]=FALSE;
98            --length;
99        }
100        start=i;
101    }
102
103    // Add an offset. The list must not contain it yet.
104    // offset=[1..maxLength]
105    void addOffset(int32_t offset) {
106        int32_t i=start+offset;
107        if(i>=capacity) {
108            i-=capacity;
109        }
110        list[i]=TRUE;
111        ++length;
112    }
113
114    // offset=[1..maxLength]
115    UBool containsOffset(int32_t offset) const {
116        int32_t i=start+offset;
117        if(i>=capacity) {
118            i-=capacity;
119        }
120        return list[i];
121    }
122
123    // Find the lowest stored offset from a non-empty list, remove it,
124    // and reduce all other offsets by this minimum.
125    // Returns [1..maxLength].
126    int32_t popMinimum() {
127        // Look for the next offset in list[start+1..capacity-1].
128        int32_t i=start, result;
129        while(++i<capacity) {
130            if(list[i]) {
131                list[i]=FALSE;
132                --length;
133                result=i-start;
134                start=i;
135                return result;
136            }
137        }
138        // i==capacity
139
140        // Wrap around and look for the next offset in list[0..start].
141        // Since the list is not empty, there will be one.
142        result=capacity-start;
143        i=0;
144        while(!list[i]) {
145            ++i;
146        }
147        list[i]=FALSE;
148        --length;
149        start=i;
150        return result+=i;
151    }
152
153private:
154    UBool *list;
155    int32_t capacity;
156    int32_t length;
157    int32_t start;
158
159    UBool staticList[16];
160};
161
162// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
163static int32_t
164getUTF8Length(const UChar *s, int32_t length) {
165    UErrorCode errorCode=U_ZERO_ERROR;
166    int32_t length8=0;
167    u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
168    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
169        return length8;
170    } else {
171        // The string contains an unpaired surrogate.
172        // Ignore this string.
173        return 0;
174    }
175}
176
177// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
178static int32_t
179appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
180    UErrorCode errorCode=U_ZERO_ERROR;
181    int32_t length8=0;
182    u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
183    if(U_SUCCESS(errorCode)) {
184        return length8;
185    } else {
186        // The string contains an unpaired surrogate.
187        // Ignore this string.
188        return 0;
189    }
190}
191
192static inline uint8_t
193makeSpanLengthByte(int32_t spanLength) {
194    // 0xfe==UnicodeSetStringSpan::LONG_SPAN
195    return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
196}
197
198// Construct for all variants of span(), or only for any one variant.
199// Initialize as little as possible, for single use.
200UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
201                                           const UVector &setStrings,
202                                           uint32_t which)
203        : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
204          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
205          utf8Length(0),
206          maxLength16(0), maxLength8(0),
207          all((UBool)(which==ALL)) {
208    spanSet.retainAll(set);
209    if(which&NOT_CONTAINED) {
210        // Default to the same sets.
211        // addToSpanNotSet() will create a separate set if necessary.
212        pSpanNotSet=&spanSet;
213    }
214
215    // Determine if the strings even need to be taken into account at all for span() etc.
216    // If any string is relevant, then all strings need to be used for
217    // span(longest match) but only the relevant ones for span(while contained).
218    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
219    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
220    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
221    // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
222    int32_t stringsLength=strings.size();
223
224    int32_t i, spanLength;
225    UBool someRelevant=FALSE;
226    for(i=0; i<stringsLength; ++i) {
227        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
228        const UChar *s16=string.getBuffer();
229        int32_t length16=string.length();
230        UBool thisRelevant;
231        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
232        if(spanLength<length16) {  // Relevant string.
233            someRelevant=thisRelevant=TRUE;
234        } else {
235            thisRelevant=FALSE;
236        }
237        if((which&UTF16) && length16>maxLength16) {
238            maxLength16=length16;
239        }
240        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
241            int32_t length8=getUTF8Length(s16, length16);
242            utf8Length+=length8;
243            if(length8>maxLength8) {
244                maxLength8=length8;
245            }
246        }
247    }
248    if(!someRelevant) {
249        maxLength16=maxLength8=0;
250        return;
251    }
252
253    // Freeze after checking for the need to use strings at all because freezing
254    // a set takes some time and memory which are wasted if there are no relevant strings.
255    if(all) {
256        spanSet.freeze();
257    }
258
259    uint8_t *spanBackLengths;
260    uint8_t *spanUTF8Lengths;
261    uint8_t *spanBackUTF8Lengths;
262
263    // Allocate a block of meta data.
264    int32_t allocSize;
265    if(all) {
266        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
267        allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
268    } else {
269        allocSize=stringsLength;  // One set of span lengths.
270        if(which&UTF8) {
271            // UTF-8 lengths and UTF-8 strings.
272            allocSize+=stringsLength*4+utf8Length;
273        }
274    }
275    if(allocSize<=(int32_t)sizeof(staticLengths)) {
276        utf8Lengths=staticLengths;
277    } else {
278        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
279        if(utf8Lengths==NULL) {
280            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
281            return;  // Out of memory.
282        }
283    }
284
285    if(all) {
286        // Store span lengths for all span() variants.
287        spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
288        spanBackLengths=spanLengths+stringsLength;
289        spanUTF8Lengths=spanBackLengths+stringsLength;
290        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
291        utf8=spanBackUTF8Lengths+stringsLength;
292    } else {
293        // Store span lengths for only one span() variant.
294        if(which&UTF8) {
295            spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
296            utf8=spanLengths+stringsLength;
297        } else {
298            spanLengths=(uint8_t *)utf8Lengths;
299        }
300        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
301    }
302
303    // Set the meta data and pSpanNotSet and write the UTF-8 strings.
304    int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
305
306    for(i=0; i<stringsLength; ++i) {
307        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
308        const UChar *s16=string.getBuffer();
309        int32_t length16=string.length();
310        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
311        if(spanLength<length16) {  // Relevant string.
312            if(which&UTF16) {
313                if(which&CONTAINED) {
314                    if(which&FWD) {
315                        spanLengths[i]=makeSpanLengthByte(spanLength);
316                    }
317                    if(which&BACK) {
318                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
319                        spanBackLengths[i]=makeSpanLengthByte(spanLength);
320                    }
321                } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
322                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
323                }
324            }
325            if(which&UTF8) {
326                uint8_t *s8=utf8+utf8Count;
327                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
328                utf8Count+=utf8Lengths[i]=length8;
329                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
330                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
331                } else {  // Relevant for UTF-8.
332                    if(which&CONTAINED) {
333                        if(which&FWD) {
334                            spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
335                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
336                        }
337                        if(which&BACK) {
338                            spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
339                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340                        }
341                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
342                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
343                    }
344                }
345            }
346            if(which&NOT_CONTAINED) {
347                // Add string start and end code points to the spanNotSet so that
348                // a span(while not contained) stops before any string.
349                UChar32 c;
350                if(which&FWD) {
351                    int32_t len=0;
352                    U16_NEXT(s16, len, length16, c);
353                    addToSpanNotSet(c);
354                }
355                if(which&BACK) {
356                    int32_t len=length16;
357                    U16_PREV(s16, 0, len, c);
358                    addToSpanNotSet(c);
359                }
360            }
361        } else {  // Irrelevant string.
362            if(which&UTF8) {
363                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
364                    uint8_t *s8=utf8+utf8Count;
365                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
366                    utf8Count+=utf8Lengths[i]=length8;
367                } else {
368                    utf8Lengths[i]=0;
369                }
370            }
371            if(all) {
372                spanLengths[i]=spanBackLengths[i]=
373                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
374                        (uint8_t)ALL_CP_CONTAINED;
375            } else {
376                // All spanXYZLengths pointers contain the same address.
377                spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
378            }
379        }
380    }
381
382    // Finish.
383    if(all) {
384        pSpanNotSet->freeze();
385    }
386}
387
388// Copy constructor. Assumes which==ALL for a frozen set.
389UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
390                                           const UVector &newParentSetStrings)
391        : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
392          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
393          utf8Length(otherStringSpan.utf8Length),
394          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
395          all(TRUE) {
396    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
397        pSpanNotSet=&spanSet;
398    } else {
399        pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
400    }
401
402    // Allocate a block of meta data.
403    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
404    int32_t stringsLength=strings.size();
405    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
406    if(allocSize<=(int32_t)sizeof(staticLengths)) {
407        utf8Lengths=staticLengths;
408    } else {
409        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
410        if(utf8Lengths==NULL) {
411            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
412            return;  // Out of memory.
413        }
414    }
415
416    spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
417    utf8=spanLengths+stringsLength*4;
418    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
419}
420
421UnicodeSetStringSpan::~UnicodeSetStringSpan() {
422    if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
423        delete pSpanNotSet;
424    }
425    if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
426        uprv_free(utf8Lengths);
427    }
428}
429
430void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
431    if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
432        if(spanSet.contains(c)) {
433            return;  // Nothing to do.
434        }
435        UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
436        if(newSet==NULL) {
437            return;  // Out of memory.
438        } else {
439            pSpanNotSet=newSet;
440        }
441    }
442    pSpanNotSet->add(c);
443}
444
445// Compare strings without any argument checks. Requires length>0.
446static inline UBool
447matches16(const UChar *s, const UChar *t, int32_t length) {
448    do {
449        if(*s++!=*t++) {
450            return FALSE;
451        }
452    } while(--length>0);
453    return TRUE;
454}
455
456static inline UBool
457matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
458    do {
459        if(*s++!=*t++) {
460            return FALSE;
461        }
462    } while(--length>0);
463    return TRUE;
464}
465
466// Compare 16-bit Unicode strings (which may be malformed UTF-16)
467// at code point boundaries.
468// That is, each edge of a match must not be in the middle of a surrogate pair.
469static inline UBool
470matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
471    s+=start;
472    limit-=start;
473    return matches16(s, t, length) &&
474           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
475           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
476}
477
478// Does the set contain the next code point?
479// If so, return its length; otherwise return its negative length.
480static inline int32_t
481spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
482    UChar c=*s, c2;
483    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
484        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
485    }
486    return set.contains(c) ? 1 : -1;
487}
488
489static inline int32_t
490spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
491    UChar c=s[length-1], c2;
492    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
493        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
494    }
495    return set.contains(c) ? 1 : -1;
496}
497
498static inline int32_t
499spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
500    UChar32 c=*s;
501    if((int8_t)c>=0) {
502        return set.contains(c) ? 1 : -1;
503    }
504    // Take advantage of non-ASCII fastpaths in U8_NEXT().
505    int32_t i=0;
506    U8_NEXT(s, i, length, c);
507    return set.contains(c) ? i : -i;
508}
509
510static inline int32_t
511spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
512    UChar32 c=s[length-1];
513    if((int8_t)c>=0) {
514        return set.contains(c) ? 1 : -1;
515    }
516    int32_t i=length-1;
517    c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
518    length-=i;
519    return set.contains(c) ? length : -length;
520}
521
522/*
523 * Note: In span() when spanLength==0 (after a string match, or at the beginning
524 * after an empty code point span) and in spanNot() and spanNotUTF8(),
525 * string matching could use a binary search
526 * because all string matches are done from the same start index.
527 *
528 * For UTF-8, this would require a comparison function that returns UTF-16 order.
529 *
530 * This optimization should not be necessary for normal UnicodeSets because
531 * most sets have no strings, and most sets with strings have
532 * very few very short strings.
533 * For cases with many strings, it might be better to use a different API
534 * and implementation with a DFA (state machine).
535 */
536
537/*
538 * Algorithm for span(USET_SPAN_CONTAINED)
539 *
540 * Theoretical algorithm:
541 * - Iterate through the string, and at each code point boundary:
542 *   + If the code point there is in the set, then remember to continue after it.
543 *   + If a set string matches at the current position, then remember to continue after it.
544 *   + Either recursively span for each code point or string match,
545 *     or recursively span for all but the shortest one and
546 *     iteratively continue the span with the shortest local match.
547 *   + Remember the longest recursive span (the farthest end point).
548 *   + If there is no match at the current position, neither for the code point there
549 *     nor for any set string, then stop and return the longest recursive span length.
550 *
551 * Optimized implementation:
552 *
553 * (We assume that most sets will have very few very short strings.
554 * A span using a string-less set is extremely fast.)
555 *
556 * Create and cache a spanSet which contains all of the single code points
557 * of the original set but none of its strings.
558 *
559 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
560 * - Loop:
561 *   + Try to match each set string at the end of the spanLength.
562 *     ~ Set strings that start with set-contained code points must be matched
563 *       with a partial overlap because the recursive algorithm would have tried
564 *       to match them at every position.
565 *     ~ Set strings that entirely consist of set-contained code points
566 *       are irrelevant for span(USET_SPAN_CONTAINED) because the
567 *       recursive algorithm would continue after them anyway
568 *       and find the longest recursive match from their end.
569 *     ~ Rather than recursing, note each end point of a set string match.
570 *   + If no set string matched after spanSet.span(), then return
571 *     with where the spanSet.span() ended.
572 *   + If at least one set string matched after spanSet.span(), then
573 *     pop the shortest string match end point and continue
574 *     the loop, trying to match all set strings from there.
575 *   + If at least one more set string matched after a previous string match,
576 *     then test if the code point after the previous string match is also
577 *     contained in the set.
578 *     Continue the loop with the shortest end point of either this code point
579 *     or a matching set string.
580 *   + If no more set string matched after a previous string match,
581 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
582 *     Stop if spanLength==0, otherwise continue the loop.
583 *
584 * By noting each end point of a set string match,
585 * the function visits each string position at most once and finishes
586 * in linear time.
587 *
588 * The recursive algorithm may visit the same string position many times
589 * if multiple paths lead to it and finishes in exponential time.
590 */
591
592/*
593 * Algorithm for span(USET_SPAN_SIMPLE)
594 *
595 * Theoretical algorithm:
596 * - Iterate through the string, and at each code point boundary:
597 *   + If the code point there is in the set, then remember to continue after it.
598 *   + If a set string matches at the current position, then remember to continue after it.
599 *   + Continue from the farthest match position and ignore all others.
600 *   + If there is no match at the current position,
601 *     then stop and return the current position.
602 *
603 * Optimized implementation:
604 *
605 * (Same assumption and spanSet as above.)
606 *
607 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
608 * - Loop:
609 *   + Try to match each set string at the end of the spanLength.
610 *     ~ Set strings that start with set-contained code points must be matched
611 *       with a partial overlap because the standard algorithm would have tried
612 *       to match them earlier.
613 *     ~ Set strings that entirely consist of set-contained code points
614 *       must be matched with a full overlap because the longest-match algorithm
615 *       would hide set string matches that end earlier.
616 *       Such set strings need not be matched earlier inside the code point span
617 *       because the standard algorithm would then have continued after
618 *       the set string match anyway.
619 *     ~ Remember the longest set string match (farthest end point) from the earliest
620 *       starting point.
621 *   + If no set string matched after spanSet.span(), then return
622 *     with where the spanSet.span() ended.
623 *   + If at least one set string matched, then continue the loop after the
624 *     longest match from the earliest position.
625 *   + If no more set string matched after a previous string match,
626 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
627 *     Stop if spanLength==0, otherwise continue the loop.
628 */
629
630int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
631    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
632        return spanNot(s, length);
633    }
634    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
635    if(spanLength==length) {
636        return length;
637    }
638
639    // Consider strings; they may overlap with the span.
640    OffsetList offsets;
641    if(spanCondition==USET_SPAN_CONTAINED) {
642        // Use offset list to try all possibilities.
643        offsets.setMaxLength(maxLength16);
644    }
645    int32_t pos=spanLength, rest=length-pos;
646    int32_t i, stringsLength=strings.size();
647    for(;;) {
648        if(spanCondition==USET_SPAN_CONTAINED) {
649            for(i=0; i<stringsLength; ++i) {
650                int32_t overlap=spanLengths[i];
651                if(overlap==ALL_CP_CONTAINED) {
652                    continue;  // Irrelevant string.
653                }
654                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
655                const UChar *s16=string.getBuffer();
656                int32_t length16=string.length();
657
658                // Try to match this string at pos-overlap..pos.
659                if(overlap>=LONG_SPAN) {
660                    overlap=length16;
661                    // While contained: No point matching fully inside the code point span.
662                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
663                }
664                if(overlap>spanLength) {
665                    overlap=spanLength;
666                }
667                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
668                for(;;) {
669                    if(inc>rest) {
670                        break;
671                    }
672                    // Try to match if the increment is not listed already.
673                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
674                        if(inc==rest) {
675                            return length;  // Reached the end of the string.
676                        }
677                        offsets.addOffset(inc);
678                    }
679                    if(overlap==0) {
680                        break;
681                    }
682                    --overlap;
683                    ++inc;
684                }
685            }
686        } else /* USET_SPAN_SIMPLE */ {
687            int32_t maxInc=0, maxOverlap=0;
688            for(i=0; i<stringsLength; ++i) {
689                int32_t overlap=spanLengths[i];
690                // For longest match, we do need to try to match even an all-contained string
691                // to find the match from the earliest start.
692
693                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
694                const UChar *s16=string.getBuffer();
695                int32_t length16=string.length();
696
697                // Try to match this string at pos-overlap..pos.
698                if(overlap>=LONG_SPAN) {
699                    overlap=length16;
700                    // Longest match: Need to match fully inside the code point span
701                    // to find the match from the earliest start.
702                }
703                if(overlap>spanLength) {
704                    overlap=spanLength;
705                }
706                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
707                for(;;) {
708                    if(inc>rest || overlap<maxOverlap) {
709                        break;
710                    }
711                    // Try to match if the string is longer or starts earlier.
712                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
713                        matches16CPB(s, pos-overlap, length, s16, length16)
714                    ) {
715                        maxInc=inc;  // Longest match from earliest start.
716                        maxOverlap=overlap;
717                        break;
718                    }
719                    --overlap;
720                    ++inc;
721                }
722            }
723
724            if(maxInc!=0 || maxOverlap!=0) {
725                // Longest-match algorithm, and there was a string match.
726                // Simply continue after it.
727                pos+=maxInc;
728                rest-=maxInc;
729                if(rest==0) {
730                    return length;  // Reached the end of the string.
731                }
732                spanLength=0;  // Match strings from after a string match.
733                continue;
734            }
735        }
736        // Finished trying to match all strings at pos.
737
738        if(spanLength!=0 || pos==0) {
739            // The position is after an unlimited code point span (spanLength!=0),
740            // not after a string match.
741            // The only position where spanLength==0 after a span is pos==0.
742            // Otherwise, an unlimited code point span is only tried again when no
743            // strings match, and if such a non-initial span fails we stop.
744            if(offsets.isEmpty()) {
745                return pos;  // No strings matched after a span.
746            }
747            // Match strings from after the next string match.
748        } else {
749            // The position is after a string match (or a single code point).
750            if(offsets.isEmpty()) {
751                // No more strings matched after a previous string match.
752                // Try another code point span from after the last string match.
753                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
754                if( spanLength==rest || // Reached the end of the string, or
755                    spanLength==0       // neither strings nor span progressed.
756                ) {
757                    return pos+spanLength;
758                }
759                pos+=spanLength;
760                rest-=spanLength;
761                continue;  // spanLength>0: Match strings from after a span.
762            } else {
763                // Try to match only one code point from after a string match if some
764                // string matched beyond it, so that we try all possible positions
765                // and don't overshoot.
766                spanLength=spanOne(spanSet, s+pos, rest);
767                if(spanLength>0) {
768                    if(spanLength==rest) {
769                        return length;  // Reached the end of the string.
770                    }
771                    // Match strings after this code point.
772                    // There cannot be any increments below it because UnicodeSet strings
773                    // contain multiple code points.
774                    pos+=spanLength;
775                    rest-=spanLength;
776                    offsets.shift(spanLength);
777                    spanLength=0;
778                    continue;  // Match strings from after a single code point.
779                }
780                // Match strings from after the next string match.
781            }
782        }
783        int32_t minOffset=offsets.popMinimum();
784        pos+=minOffset;
785        rest-=minOffset;
786        spanLength=0;  // Match strings from after a string match.
787    }
788}
789
790int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
791    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
792        return spanNotBack(s, length);
793    }
794    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
795    if(pos==0) {
796        return 0;
797    }
798    int32_t spanLength=length-pos;
799
800    // Consider strings; they may overlap with the span.
801    OffsetList offsets;
802    if(spanCondition==USET_SPAN_CONTAINED) {
803        // Use offset list to try all possibilities.
804        offsets.setMaxLength(maxLength16);
805    }
806    int32_t i, stringsLength=strings.size();
807    uint8_t *spanBackLengths=spanLengths;
808    if(all) {
809        spanBackLengths+=stringsLength;
810    }
811    for(;;) {
812        if(spanCondition==USET_SPAN_CONTAINED) {
813            for(i=0; i<stringsLength; ++i) {
814                int32_t overlap=spanBackLengths[i];
815                if(overlap==ALL_CP_CONTAINED) {
816                    continue;  // Irrelevant string.
817                }
818                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
819                const UChar *s16=string.getBuffer();
820                int32_t length16=string.length();
821
822                // Try to match this string at pos-(length16-overlap)..pos-length16.
823                if(overlap>=LONG_SPAN) {
824                    overlap=length16;
825                    // While contained: No point matching fully inside the code point span.
826                    int32_t len1=0;
827                    U16_FWD_1(s16, len1, overlap);
828                    overlap-=len1;  // Length of the string minus the first code point.
829                }
830                if(overlap>spanLength) {
831                    overlap=spanLength;
832                }
833                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
834                for(;;) {
835                    if(dec>pos) {
836                        break;
837                    }
838                    // Try to match if the decrement is not listed already.
839                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
840                        if(dec==pos) {
841                            return 0;  // Reached the start of the string.
842                        }
843                        offsets.addOffset(dec);
844                    }
845                    if(overlap==0) {
846                        break;
847                    }
848                    --overlap;
849                    ++dec;
850                }
851            }
852        } else /* USET_SPAN_SIMPLE */ {
853            int32_t maxDec=0, maxOverlap=0;
854            for(i=0; i<stringsLength; ++i) {
855                int32_t overlap=spanBackLengths[i];
856                // For longest match, we do need to try to match even an all-contained string
857                // to find the match from the latest end.
858
859                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
860                const UChar *s16=string.getBuffer();
861                int32_t length16=string.length();
862
863                // Try to match this string at pos-(length16-overlap)..pos-length16.
864                if(overlap>=LONG_SPAN) {
865                    overlap=length16;
866                    // Longest match: Need to match fully inside the code point span
867                    // to find the match from the latest end.
868                }
869                if(overlap>spanLength) {
870                    overlap=spanLength;
871                }
872                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
873                for(;;) {
874                    if(dec>pos || overlap<maxOverlap) {
875                        break;
876                    }
877                    // Try to match if the string is longer or ends later.
878                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
879                        matches16CPB(s, pos-dec, length, s16, length16)
880                    ) {
881                        maxDec=dec;  // Longest match from latest end.
882                        maxOverlap=overlap;
883                        break;
884                    }
885                    --overlap;
886                    ++dec;
887                }
888            }
889
890            if(maxDec!=0 || maxOverlap!=0) {
891                // Longest-match algorithm, and there was a string match.
892                // Simply continue before it.
893                pos-=maxDec;
894                if(pos==0) {
895                    return 0;  // Reached the start of the string.
896                }
897                spanLength=0;  // Match strings from before a string match.
898                continue;
899            }
900        }
901        // Finished trying to match all strings at pos.
902
903        if(spanLength!=0 || pos==length) {
904            // The position is before an unlimited code point span (spanLength!=0),
905            // not before a string match.
906            // The only position where spanLength==0 before a span is pos==length.
907            // Otherwise, an unlimited code point span is only tried again when no
908            // strings match, and if such a non-initial span fails we stop.
909            if(offsets.isEmpty()) {
910                return pos;  // No strings matched before a span.
911            }
912            // Match strings from before the next string match.
913        } else {
914            // The position is before a string match (or a single code point).
915            if(offsets.isEmpty()) {
916                // No more strings matched before a previous string match.
917                // Try another code point span from before the last string match.
918                int32_t oldPos=pos;
919                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
920                spanLength=oldPos-pos;
921                if( pos==0 ||           // Reached the start of the string, or
922                    spanLength==0       // neither strings nor span progressed.
923                ) {
924                    return pos;
925                }
926                continue;  // spanLength>0: Match strings from before a span.
927            } else {
928                // Try to match only one code point from before a string match if some
929                // string matched beyond it, so that we try all possible positions
930                // and don't overshoot.
931                spanLength=spanOneBack(spanSet, s, pos);
932                if(spanLength>0) {
933                    if(spanLength==pos) {
934                        return 0;  // Reached the start of the string.
935                    }
936                    // Match strings before this code point.
937                    // There cannot be any decrements below it because UnicodeSet strings
938                    // contain multiple code points.
939                    pos-=spanLength;
940                    offsets.shift(spanLength);
941                    spanLength=0;
942                    continue;  // Match strings from before a single code point.
943                }
944                // Match strings from before the next string match.
945            }
946        }
947        pos-=offsets.popMinimum();
948        spanLength=0;  // Match strings from before a string match.
949    }
950}
951
952int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
953    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
954        return spanNotUTF8(s, length);
955    }
956    int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
957    if(spanLength==length) {
958        return length;
959    }
960
961    // Consider strings; they may overlap with the span.
962    OffsetList offsets;
963    if(spanCondition==USET_SPAN_CONTAINED) {
964        // Use offset list to try all possibilities.
965        offsets.setMaxLength(maxLength8);
966    }
967    int32_t pos=spanLength, rest=length-pos;
968    int32_t i, stringsLength=strings.size();
969    uint8_t *spanUTF8Lengths=spanLengths;
970    if(all) {
971        spanUTF8Lengths+=2*stringsLength;
972    }
973    for(;;) {
974        const uint8_t *s8=utf8;
975        int32_t length8;
976        if(spanCondition==USET_SPAN_CONTAINED) {
977            for(i=0; i<stringsLength; ++i) {
978                length8=utf8Lengths[i];
979                if(length8==0) {
980                    continue;  // String not representable in UTF-8.
981                }
982                int32_t overlap=spanUTF8Lengths[i];
983                if(overlap==ALL_CP_CONTAINED) {
984                    s8+=length8;
985                    continue;  // Irrelevant string.
986                }
987
988                // Try to match this string at pos-overlap..pos.
989                if(overlap>=LONG_SPAN) {
990                    overlap=length8;
991                    // While contained: No point matching fully inside the code point span.
992                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
993                }
994                if(overlap>spanLength) {
995                    overlap=spanLength;
996                }
997                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
998                for(;;) {
999                    if(inc>rest) {
1000                        break;
1001                    }
1002                    // Try to match if the increment is not listed already.
1003                    // Match at code point boundaries. (The UTF-8 strings were converted
1004                    // from UTF-16 and are guaranteed to be well-formed.)
1005                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1006                        !offsets.containsOffset(inc) &&
1007                        matches8(s+pos-overlap, s8, length8)
1008
1009                    ) {
1010                        if(inc==rest) {
1011                            return length;  // Reached the end of the string.
1012                        }
1013                        offsets.addOffset(inc);
1014                    }
1015                    if(overlap==0) {
1016                        break;
1017                    }
1018                    --overlap;
1019                    ++inc;
1020                }
1021                s8+=length8;
1022            }
1023        } else /* USET_SPAN_SIMPLE */ {
1024            int32_t maxInc=0, maxOverlap=0;
1025            for(i=0; i<stringsLength; ++i) {
1026                length8=utf8Lengths[i];
1027                if(length8==0) {
1028                    continue;  // String not representable in UTF-8.
1029                }
1030                int32_t overlap=spanUTF8Lengths[i];
1031                // For longest match, we do need to try to match even an all-contained string
1032                // to find the match from the earliest start.
1033
1034                // Try to match this string at pos-overlap..pos.
1035                if(overlap>=LONG_SPAN) {
1036                    overlap=length8;
1037                    // Longest match: Need to match fully inside the code point span
1038                    // to find the match from the earliest start.
1039                }
1040                if(overlap>spanLength) {
1041                    overlap=spanLength;
1042                }
1043                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1044                for(;;) {
1045                    if(inc>rest || overlap<maxOverlap) {
1046                        break;
1047                    }
1048                    // Try to match if the string is longer or starts earlier.
1049                    // Match at code point boundaries. (The UTF-8 strings were converted
1050                    // from UTF-16 and are guaranteed to be well-formed.)
1051                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1052                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1053                        matches8(s+pos-overlap, s8, length8)
1054
1055                    ) {
1056                        maxInc=inc;  // Longest match from earliest start.
1057                        maxOverlap=overlap;
1058                        break;
1059                    }
1060                    --overlap;
1061                    ++inc;
1062                }
1063                s8+=length8;
1064            }
1065
1066            if(maxInc!=0 || maxOverlap!=0) {
1067                // Longest-match algorithm, and there was a string match.
1068                // Simply continue after it.
1069                pos+=maxInc;
1070                rest-=maxInc;
1071                if(rest==0) {
1072                    return length;  // Reached the end of the string.
1073                }
1074                spanLength=0;  // Match strings from after a string match.
1075                continue;
1076            }
1077        }
1078        // Finished trying to match all strings at pos.
1079
1080        if(spanLength!=0 || pos==0) {
1081            // The position is after an unlimited code point span (spanLength!=0),
1082            // not after a string match.
1083            // The only position where spanLength==0 after a span is pos==0.
1084            // Otherwise, an unlimited code point span is only tried again when no
1085            // strings match, and if such a non-initial span fails we stop.
1086            if(offsets.isEmpty()) {
1087                return pos;  // No strings matched after a span.
1088            }
1089            // Match strings from after the next string match.
1090        } else {
1091            // The position is after a string match (or a single code point).
1092            if(offsets.isEmpty()) {
1093                // No more strings matched after a previous string match.
1094                // Try another code point span from after the last string match.
1095                spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1096                if( spanLength==rest || // Reached the end of the string, or
1097                    spanLength==0       // neither strings nor span progressed.
1098                ) {
1099                    return pos+spanLength;
1100                }
1101                pos+=spanLength;
1102                rest-=spanLength;
1103                continue;  // spanLength>0: Match strings from after a span.
1104            } else {
1105                // Try to match only one code point from after a string match if some
1106                // string matched beyond it, so that we try all possible positions
1107                // and don't overshoot.
1108                spanLength=spanOneUTF8(spanSet, s+pos, rest);
1109                if(spanLength>0) {
1110                    if(spanLength==rest) {
1111                        return length;  // Reached the end of the string.
1112                    }
1113                    // Match strings after this code point.
1114                    // There cannot be any increments below it because UnicodeSet strings
1115                    // contain multiple code points.
1116                    pos+=spanLength;
1117                    rest-=spanLength;
1118                    offsets.shift(spanLength);
1119                    spanLength=0;
1120                    continue;  // Match strings from after a single code point.
1121                }
1122                // Match strings from after the next string match.
1123            }
1124        }
1125        int32_t minOffset=offsets.popMinimum();
1126        pos+=minOffset;
1127        rest-=minOffset;
1128        spanLength=0;  // Match strings from after a string match.
1129    }
1130}
1131
1132int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1133    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1134        return spanNotBackUTF8(s, length);
1135    }
1136    int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1137    if(pos==0) {
1138        return 0;
1139    }
1140    int32_t spanLength=length-pos;
1141
1142    // Consider strings; they may overlap with the span.
1143    OffsetList offsets;
1144    if(spanCondition==USET_SPAN_CONTAINED) {
1145        // Use offset list to try all possibilities.
1146        offsets.setMaxLength(maxLength8);
1147    }
1148    int32_t i, stringsLength=strings.size();
1149    uint8_t *spanBackUTF8Lengths=spanLengths;
1150    if(all) {
1151        spanBackUTF8Lengths+=3*stringsLength;
1152    }
1153    for(;;) {
1154        const uint8_t *s8=utf8;
1155        int32_t length8;
1156        if(spanCondition==USET_SPAN_CONTAINED) {
1157            for(i=0; i<stringsLength; ++i) {
1158                length8=utf8Lengths[i];
1159                if(length8==0) {
1160                    continue;  // String not representable in UTF-8.
1161                }
1162                int32_t overlap=spanBackUTF8Lengths[i];
1163                if(overlap==ALL_CP_CONTAINED) {
1164                    s8+=length8;
1165                    continue;  // Irrelevant string.
1166                }
1167
1168                // Try to match this string at pos-(length8-overlap)..pos-length8.
1169                if(overlap>=LONG_SPAN) {
1170                    overlap=length8;
1171                    // While contained: No point matching fully inside the code point span.
1172                    int32_t len1=0;
1173                    U8_FWD_1(s8, len1, overlap);
1174                    overlap-=len1;  // Length of the string minus the first code point.
1175                }
1176                if(overlap>spanLength) {
1177                    overlap=spanLength;
1178                }
1179                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1180                for(;;) {
1181                    if(dec>pos) {
1182                        break;
1183                    }
1184                    // Try to match if the decrement is not listed already.
1185                    // Match at code point boundaries. (The UTF-8 strings were converted
1186                    // from UTF-16 and are guaranteed to be well-formed.)
1187                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1188                        !offsets.containsOffset(dec) &&
1189                        matches8(s+pos-dec, s8, length8)
1190                    ) {
1191                        if(dec==pos) {
1192                            return 0;  // Reached the start of the string.
1193                        }
1194                        offsets.addOffset(dec);
1195                    }
1196                    if(overlap==0) {
1197                        break;
1198                    }
1199                    --overlap;
1200                    ++dec;
1201                }
1202                s8+=length8;
1203            }
1204        } else /* USET_SPAN_SIMPLE */ {
1205            int32_t maxDec=0, maxOverlap=0;
1206            for(i=0; i<stringsLength; ++i) {
1207                length8=utf8Lengths[i];
1208                if(length8==0) {
1209                    continue;  // String not representable in UTF-8.
1210                }
1211                int32_t overlap=spanBackUTF8Lengths[i];
1212                // For longest match, we do need to try to match even an all-contained string
1213                // to find the match from the latest end.
1214
1215                // Try to match this string at pos-(length8-overlap)..pos-length8.
1216                if(overlap>=LONG_SPAN) {
1217                    overlap=length8;
1218                    // Longest match: Need to match fully inside the code point span
1219                    // to find the match from the latest end.
1220                }
1221                if(overlap>spanLength) {
1222                    overlap=spanLength;
1223                }
1224                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1225                for(;;) {
1226                    if(dec>pos || overlap<maxOverlap) {
1227                        break;
1228                    }
1229                    // Try to match if the string is longer or ends later.
1230                    // Match at code point boundaries. (The UTF-8 strings were converted
1231                    // from UTF-16 and are guaranteed to be well-formed.)
1232                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1233                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1234                        matches8(s+pos-dec, s8, length8)
1235                    ) {
1236                        maxDec=dec;  // Longest match from latest end.
1237                        maxOverlap=overlap;
1238                        break;
1239                    }
1240                    --overlap;
1241                    ++dec;
1242                }
1243                s8+=length8;
1244            }
1245
1246            if(maxDec!=0 || maxOverlap!=0) {
1247                // Longest-match algorithm, and there was a string match.
1248                // Simply continue before it.
1249                pos-=maxDec;
1250                if(pos==0) {
1251                    return 0;  // Reached the start of the string.
1252                }
1253                spanLength=0;  // Match strings from before a string match.
1254                continue;
1255            }
1256        }
1257        // Finished trying to match all strings at pos.
1258
1259        if(spanLength!=0 || pos==length) {
1260            // The position is before an unlimited code point span (spanLength!=0),
1261            // not before a string match.
1262            // The only position where spanLength==0 before a span is pos==length.
1263            // Otherwise, an unlimited code point span is only tried again when no
1264            // strings match, and if such a non-initial span fails we stop.
1265            if(offsets.isEmpty()) {
1266                return pos;  // No strings matched before a span.
1267            }
1268            // Match strings from before the next string match.
1269        } else {
1270            // The position is before a string match (or a single code point).
1271            if(offsets.isEmpty()) {
1272                // No more strings matched before a previous string match.
1273                // Try another code point span from before the last string match.
1274                int32_t oldPos=pos;
1275                pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1276                spanLength=oldPos-pos;
1277                if( pos==0 ||           // Reached the start of the string, or
1278                    spanLength==0       // neither strings nor span progressed.
1279                ) {
1280                    return pos;
1281                }
1282                continue;  // spanLength>0: Match strings from before a span.
1283            } else {
1284                // Try to match only one code point from before a string match if some
1285                // string matched beyond it, so that we try all possible positions
1286                // and don't overshoot.
1287                spanLength=spanOneBackUTF8(spanSet, s, pos);
1288                if(spanLength>0) {
1289                    if(spanLength==pos) {
1290                        return 0;  // Reached the start of the string.
1291                    }
1292                    // Match strings before this code point.
1293                    // There cannot be any decrements below it because UnicodeSet strings
1294                    // contain multiple code points.
1295                    pos-=spanLength;
1296                    offsets.shift(spanLength);
1297                    spanLength=0;
1298                    continue;  // Match strings from before a single code point.
1299                }
1300                // Match strings from before the next string match.
1301            }
1302        }
1303        pos-=offsets.popMinimum();
1304        spanLength=0;  // Match strings from before a string match.
1305    }
1306}
1307
1308/*
1309 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1310 *
1311 * Theoretical algorithm:
1312 * - Iterate through the string, and at each code point boundary:
1313 *   + If the code point there is in the set, then return with the current position.
1314 *   + If a set string matches at the current position, then return with the current position.
1315 *
1316 * Optimized implementation:
1317 *
1318 * (Same assumption as for span() above.)
1319 *
1320 * Create and cache a spanNotSet which contains all of the single code points
1321 * of the original set but none of its strings.
1322 * For each set string add its initial code point to the spanNotSet.
1323 * (Also add its final code point for spanNotBack().)
1324 *
1325 * - Loop:
1326 *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1327 *   + If the current code point is in the original set, then
1328 *     return the current position.
1329 *   + If any set string matches at the current position, then
1330 *     return the current position.
1331 *   + If there is no match at the current position, neither for the code point there
1332 *     nor for any set string, then skip this code point and continue the loop.
1333 *     This happens for set-string-initial code points that were added to spanNotSet
1334 *     when there is not actually a match for such a set string.
1335 */
1336
1337int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1338    int32_t pos=0, rest=length;
1339    int32_t i, stringsLength=strings.size();
1340    do {
1341        // Span until we find a code point from the set,
1342        // or a code point that starts or ends some string.
1343        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1344        if(i==rest) {
1345            return length;  // Reached the end of the string.
1346        }
1347        pos+=i;
1348        rest-=i;
1349
1350        // Check whether the current code point is in the original set,
1351        // without the string starts and ends.
1352        int32_t cpLength=spanOne(spanSet, s+pos, rest);
1353        if(cpLength>0) {
1354            return pos;  // There is a set element at pos.
1355        }
1356
1357        // Try to match the strings at pos.
1358        for(i=0; i<stringsLength; ++i) {
1359            if(spanLengths[i]==ALL_CP_CONTAINED) {
1360                continue;  // Irrelevant string.
1361            }
1362            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1363            const UChar *s16=string.getBuffer();
1364            int32_t length16=string.length();
1365            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1366                return pos;  // There is a set element at pos.
1367            }
1368        }
1369
1370        // The span(while not contained) ended on a string start/end which is
1371        // not in the original set. Skip this code point and continue.
1372        // cpLength<0
1373        pos-=cpLength;
1374        rest+=cpLength;
1375    } while(rest!=0);
1376    return length;  // Reached the end of the string.
1377}
1378
1379int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1380    int32_t pos=length;
1381    int32_t i, stringsLength=strings.size();
1382    do {
1383        // Span until we find a code point from the set,
1384        // or a code point that starts or ends some string.
1385        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1386        if(pos==0) {
1387            return 0;  // Reached the start of the string.
1388        }
1389
1390        // Check whether the current code point is in the original set,
1391        // without the string starts and ends.
1392        int32_t cpLength=spanOneBack(spanSet, s, pos);
1393        if(cpLength>0) {
1394            return pos;  // There is a set element at pos.
1395        }
1396
1397        // Try to match the strings at pos.
1398        for(i=0; i<stringsLength; ++i) {
1399            // Use spanLengths rather than a spanBackLengths pointer because
1400            // it is easier and we only need to know whether the string is irrelevant
1401            // which is the same in either array.
1402            if(spanLengths[i]==ALL_CP_CONTAINED) {
1403                continue;  // Irrelevant string.
1404            }
1405            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1406            const UChar *s16=string.getBuffer();
1407            int32_t length16=string.length();
1408            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1409                return pos;  // There is a set element at pos.
1410            }
1411        }
1412
1413        // The span(while not contained) ended on a string start/end which is
1414        // not in the original set. Skip this code point and continue.
1415        // cpLength<0
1416        pos+=cpLength;
1417    } while(pos!=0);
1418    return 0;  // Reached the start of the string.
1419}
1420
1421int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1422    int32_t pos=0, rest=length;
1423    int32_t i, stringsLength=strings.size();
1424    uint8_t *spanUTF8Lengths=spanLengths;
1425    if(all) {
1426        spanUTF8Lengths+=2*stringsLength;
1427    }
1428    do {
1429        // Span until we find a code point from the set,
1430        // or a code point that starts or ends some string.
1431        i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1432        if(i==rest) {
1433            return length;  // Reached the end of the string.
1434        }
1435        pos+=i;
1436        rest-=i;
1437
1438        // Check whether the current code point is in the original set,
1439        // without the string starts and ends.
1440        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1441        if(cpLength>0) {
1442            return pos;  // There is a set element at pos.
1443        }
1444
1445        // Try to match the strings at pos.
1446        const uint8_t *s8=utf8;
1447        int32_t length8;
1448        for(i=0; i<stringsLength; ++i) {
1449            length8=utf8Lengths[i];
1450            // ALL_CP_CONTAINED: Irrelevant string.
1451            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1452                return pos;  // There is a set element at pos.
1453            }
1454            s8+=length8;
1455        }
1456
1457        // The span(while not contained) ended on a string start/end which is
1458        // not in the original set. Skip this code point and continue.
1459        // cpLength<0
1460        pos-=cpLength;
1461        rest+=cpLength;
1462    } while(rest!=0);
1463    return length;  // Reached the end of the string.
1464}
1465
1466int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1467    int32_t pos=length;
1468    int32_t i, stringsLength=strings.size();
1469    uint8_t *spanBackUTF8Lengths=spanLengths;
1470    if(all) {
1471        spanBackUTF8Lengths+=3*stringsLength;
1472    }
1473    do {
1474        // Span until we find a code point from the set,
1475        // or a code point that starts or ends some string.
1476        pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1477        if(pos==0) {
1478            return 0;  // Reached the start of the string.
1479        }
1480
1481        // Check whether the current code point is in the original set,
1482        // without the string starts and ends.
1483        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1484        if(cpLength>0) {
1485            return pos;  // There is a set element at pos.
1486        }
1487
1488        // Try to match the strings at pos.
1489        const uint8_t *s8=utf8;
1490        int32_t length8;
1491        for(i=0; i<stringsLength; ++i) {
1492            length8=utf8Lengths[i];
1493            // ALL_CP_CONTAINED: Irrelevant string.
1494            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1495                return pos;  // There is a set element at pos.
1496            }
1497            s8+=length8;
1498        }
1499
1500        // The span(while not contained) ended on a string start/end which is
1501        // not in the original set. Skip this code point and continue.
1502        // cpLength<0
1503        pos+=cpLength;
1504    } while(pos!=0);
1505    return 0;  // Reached the start of the string.
1506}
1507
1508U_NAMESPACE_END
1509