1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Copyright (C) 2007, International Business Machines
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   file name:  unisetspan.cpp
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   encoding:   US-ASCII
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   tab size:   8 (not used)
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   indentation:4
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created on: 2007mar01
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created by: Markus W. Scherer
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ustring.h"
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unisetspan.h"
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * List of offsets from the current position from where to try matching
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * a code point or a string.
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Store offsets rather than indexes to simplify the code and use the same list
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for both increments (in span()) and decrements (in spanBack()).
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Assumption: The maximum offset is limited, and the offsets that are stored
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * at any one time are relatively dense, that is, there are normally no gaps of
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * hundreds or thousands of offset values.
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The implementation uses a circular buffer of byte flags,
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * each indicating whether the corresponding offset is in the list.
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This avoids inserting into a sorted list of offsets (or absolute indexes) and
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * physically moving part of the list.
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: In principle, the caller should setMaxLength() to the maximum of the
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * max string length and U16_LENGTH/U8_LENGTH to account for
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * "long" single code points.
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * However, this implementation uses at least a staticList with more than
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * U8_LENGTH entries anyway.
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: If maxLength were guaranteed to be no more than 32 or 64,
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the list could be stored as bit flags in a single integer.
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Rather than handling a circular buffer with a start list index,
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the integer would simply be shifted when lower offsets are removed.
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeSet does not have a limit on the lengths of strings.
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ~OffsetList() {
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(list!=staticList) {
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            uprv_free(list);
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Call exactly once if the list is to be used.
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void setMaxLength(int32_t maxLength) {
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(maxLength<=(int32_t)sizeof(staticList)) {
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            capacity=(int32_t)sizeof(staticList);
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            UBool *l=(UBool *)uprv_malloc(maxLength);
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(l!=NULL) {
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                list=l;
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                capacity=maxLength;
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_memset(list, 0, capacity);
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void clear() {
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_memset(list, 0, capacity);
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        start=length=0;
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool isEmpty() const {
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return (UBool)(length==0);
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Reduce all stored offsets by delta, used when the current position
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // moves by delta.
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // There must not be any offsets lower than delta.
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // If there is an offset equal to delta, it is removed.
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // delta=[1..maxLength]
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void shift(int32_t delta) {
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t i=start+delta;
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i>=capacity) {
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            i-=capacity;
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(list[i]) {
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            list[i]=FALSE;
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            --length;
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        start=i;
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Add an offset. The list must not contain it yet.
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // offset=[1..maxLength]
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void addOffset(int32_t offset) {
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t i=start+offset;
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i>=capacity) {
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            i-=capacity;
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        list[i]=TRUE;
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ++length;
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // offset=[1..maxLength]
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool containsOffset(int32_t offset) const {
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t i=start+offset;
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i>=capacity) {
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            i-=capacity;
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return list[i];
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Find the lowest stored offset from a non-empty list, remove it,
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // and reduce all other offsets by this minimum.
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Returns [1..maxLength].
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t popMinimum() {
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Look for the next offset in list[start+1..capacity-1].
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t i=start, result;
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while(++i<capacity) {
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(list[i]) {
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                list[i]=FALSE;
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                --length;
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                result=i-start;
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                start=i;
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return result;
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // i==capacity
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Wrap around and look for the next offset in list[0..start].
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Since the list is not empty, there will be one.
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        result=capacity-start;
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        i=0;
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while(!list[i]) {
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            ++i;
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        list[i]=FALSE;
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        --length;
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        start=i;
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return result+=i;
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate:
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool *list;
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t capacity;
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t length;
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t start;
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool staticList[16];
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugetUTF8Length(const UChar *s, int32_t length) {
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode errorCode=U_ZERO_ERROR;
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t length8=0;
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return length8;
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The string contains an unpaired surrogate.
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Ignore this string.
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruappendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode errorCode=U_ZERO_ERROR;
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t length8=0;
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_SUCCESS(errorCode)) {
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return length8;
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The string contains an unpaired surrogate.
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Ignore this string.
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline uint8_t
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerumakeSpanLengthByte(int32_t spanLength) {
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // 0xfe==UnicodeSetStringSpan::LONG_SPAN
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Construct for all variants of span(), or only for any one variant.
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Initialize as little as possible, for single use.
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           const UVector &setStrings,
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           uint32_t which)
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          utf8Length(0),
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          maxLength16(0), maxLength8(0),
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          all((UBool)(which==ALL)) {
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    spanSet.retainAll(set);
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(which&NOT_CONTAINED) {
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Default to the same sets.
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // addToSpanNotSet() will create a separate set if necessary.
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pSpanNotSet=&spanSet;
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Determine if the strings even need to be taken into account at all for span() etc.
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // If any string is relevant, then all strings need to be used for
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // span(longest match) but only the relevant ones for span(while contained).
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t stringsLength=strings.size();
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, spanLength;
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool someRelevant=FALSE;
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(i=0; i<stringsLength; ++i) {
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const UChar *s16=string.getBuffer();
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length16=string.length();
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UBool thisRelevant;
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength<length16) {  // Relevant string.
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            someRelevant=thisRelevant=TRUE;
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            thisRelevant=FALSE;
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if((which&UTF16) && length16>maxLength16) {
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            maxLength16=length16;
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t length8=getUTF8Length(s16, length16);
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utf8Length+=length8;
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length8>maxLength8) {
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                maxLength8=length8;
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(!someRelevant) {
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        maxLength16=maxLength8=0;
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Freeze after checking for the need to use strings at all because freezing
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // a set takes some time and memory which are wasted if there are no relevant strings.
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanSet.freeze();
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackLengths;
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanUTF8Lengths;
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackUTF8Lengths;
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Allocate a block of meta data.
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t allocSize;
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        allocSize=stringsLength;  // One set of span lengths.
270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(which&UTF8) {
271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // UTF-8 lengths and UTF-8 strings.
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            allocSize+=stringsLength*4+utf8Length;
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(allocSize<=(int32_t)sizeof(staticLengths)) {
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8Lengths=staticLengths;
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(utf8Lengths==NULL) {
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;  // Out of memory.
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Store span lengths for all span() variants.
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackLengths=spanLengths+stringsLength;
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanUTF8Lengths=spanBackLengths+stringsLength;
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8=spanBackUTF8Lengths+stringsLength;
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Store span lengths for only one span() variant.
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(which&UTF8) {
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utf8=spanLengths+stringsLength;
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            spanLengths=(uint8_t *)utf8Lengths;
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Set the meta data and pSpanNotSet and write the UTF-8 strings.
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(i=0; i<stringsLength; ++i) {
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const UChar *s16=string.getBuffer();
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length16=string.length();
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength<length16) {  // Relevant string.
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(which&UTF16) {
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(which&CONTAINED) {
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(which&FWD) {
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        spanLengths[i]=makeSpanLengthByte(spanLength);
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(which&BACK) {
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        spanBackLengths[i]=makeSpanLengthByte(spanLength);
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(which&UTF8) {
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                uint8_t *s8=utf8+utf8Count;
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                utf8Count+=utf8Lengths[i]=length8;
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {  // Relevant for UTF-8.
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(which&CONTAINED) {
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(which&FWD) {
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(which&BACK) {
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(which&NOT_CONTAINED) {
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Add string start and end code points to the spanNotSet so that
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // a span(while not contained) stops before any string.
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                UChar32 c;
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(which&FWD) {
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t len=0;
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U16_NEXT(s16, len, length16, c);
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    addToSpanNotSet(c);
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(which&BACK) {
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t len=length16;
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U16_PREV(s16, 0, len, c);
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    addToSpanNotSet(c);
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {  // Irrelevant string.
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(which&UTF8) {
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    uint8_t *s8=utf8+utf8Count;
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    utf8Count+=utf8Lengths[i]=length8;
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    utf8Lengths[i]=0;
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(all) {
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLengths[i]=spanBackLengths[i]=
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        (uint8_t)ALL_CP_CONTAINED;
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // All spanXYZLengths pointers contain the same address.
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Finish.
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pSpanNotSet->freeze();
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Copy constructor. Assumes which==ALL for a frozen set.
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           const UVector &newParentSetStrings)
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          utf8Length(otherStringSpan.utf8Length),
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          all(TRUE) {
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pSpanNotSet=&spanSet;
398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Allocate a block of meta data.
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t stringsLength=strings.size();
405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(allocSize<=(int32_t)sizeof(staticLengths)) {
407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8Lengths=staticLengths;
408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(utf8Lengths==NULL) {
411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;  // Out of memory.
413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    utf8=spanLengths+stringsLength*4;
418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::~UnicodeSetStringSpan() {
422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete pSpanNotSet;
424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_free(utf8Lengths);
427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanSet.contains(c)) {
433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;  // Nothing to do.
434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(newSet==NULL) {
437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;  // Out of memory.
438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            pSpanNotSet=newSet;
440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    pSpanNotSet->add(c);
443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Compare strings without any argument checks. Requires length>0.
446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool
447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches16(const UChar *s, const UChar *t, int32_t length) {
448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(*s++!=*t++) {
450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return FALSE;
451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(--length>0);
453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return TRUE;
454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool
457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches8(const uint8_t *s, const uint8_t *t, int32_t length) {
458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(*s++!=*t++) {
460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return FALSE;
461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(--length>0);
463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return TRUE;
464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Compare 16-bit Unicode strings (which may be malformed UTF-16)
467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// at code point boundaries.
468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// That is, each edge of a match must not be in the middle of a surrogate pair.
469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool
470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    s+=start;
472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    limit-=start;
473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return matches16(s, t, length) &&
474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Does the set contain the next code point?
479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// If so, return its length; otherwise return its negative length.
480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t
481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar c=*s, c2;
483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return set.contains(c) ? 1 : -1;
487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t
490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar c=s[length-1], c2;
492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return set.contains(c) ? 1 : -1;
496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t
499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 c=*s;
501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if((int8_t)c>=0) {
502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return set.contains(c) ? 1 : -1;
503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Take advantage of non-ASCII fastpaths in U8_NEXT().
505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i=0;
506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    U8_NEXT(s, i, length, c);
507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return set.contains(c) ? i : -i;
508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t
511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 c=s[length-1];
513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if((int8_t)c>=0) {
514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return set.contains(c) ? 1 : -1;
515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i=length-1;
517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    length-=i;
519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return set.contains(c) ? length : -length;
520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: In span() when spanLength==0 (after a string match, or at the beginning
524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * after an empty code point span) and in spanNot() and spanNotUTF8(),
525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * string matching could use a binary search
526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * because all string matches are done from the same start index.
527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For UTF-8, this would require a comparison function that returns UTF-16 order.
529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This optimization should not be necessary for normal UnicodeSets because
531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * most sets have no strings, and most sets with strings have
532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * very few very short strings.
533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For cases with many strings, it might be better to use a different API
534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and implementation with a DFA (state machine).
535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for span(USET_SPAN_CONTAINED)
539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm:
541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary:
542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If the code point there is in the set, then remember to continue after it.
543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If a set string matches at the current position, then remember to continue after it.
544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Either recursively span for each code point or string match,
545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     or recursively span for all but the shortest one and
546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     iteratively continue the span with the shortest local match.
547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Remember the longest recursive span (the farthest end point).
548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If there is no match at the current position, neither for the code point there
549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     nor for any set string, then stop and return the longest recursive span length.
550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation:
552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (We assume that most sets will have very few very short strings.
554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * A span using a string-less set is extremely fast.)
555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Create and cache a spanSet which contains all of the single code points
557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of the original set but none of its strings.
558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop:
561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Try to match each set string at the end of the spanLength.
562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Set strings that start with set-contained code points must be matched
563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       with a partial overlap because the recursive algorithm would have tried
564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       to match them at every position.
565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Set strings that entirely consist of set-contained code points
566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       are irrelevant for span(USET_SPAN_CONTAINED) because the
567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       recursive algorithm would continue after them anyway
568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       and find the longest recursive match from their end.
569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Rather than recursing, note each end point of a set string match.
570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If no set string matched after spanSet.span(), then return
571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     with where the spanSet.span() ended.
572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If at least one set string matched after spanSet.span(), then
573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     pop the shortest string match end point and continue
574ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     the loop, trying to match all set strings from there.
575ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If at least one more set string matched after a previous string match,
576ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     then test if the code point after the previous string match is also
577ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     contained in the set.
578ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     Continue the loop with the shortest end point of either this code point
579ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     or a matching set string.
580ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If no more set string matched after a previous string match,
581ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
582ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     Stop if spanLength==0, otherwise continue the loop.
583ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
584ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * By noting each end point of a set string match,
585ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the function visits each string position at most once and finishes
586ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * in linear time.
587ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
588ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The recursive algorithm may visit the same string position many times
589ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * if multiple paths lead to it and finishes in exponential time.
590ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
591ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
592ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
593ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for span(USET_SPAN_SIMPLE)
594ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
595ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm:
596ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary:
597ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If the code point there is in the set, then remember to continue after it.
598ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If a set string matches at the current position, then remember to continue after it.
599ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Continue from the farthest match position and ignore all others.
600ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If there is no match at the current position,
601ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     then stop and return the current position.
602ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
603ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation:
604ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
605ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Same assumption and spanSet as above.)
606ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
607ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
608ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop:
609ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Try to match each set string at the end of the spanLength.
610ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Set strings that start with set-contained code points must be matched
611ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       with a partial overlap because the standard algorithm would have tried
612ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       to match them earlier.
613ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Set strings that entirely consist of set-contained code points
614ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       must be matched with a full overlap because the longest-match algorithm
615ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       would hide set string matches that end earlier.
616ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       Such set strings need not be matched earlier inside the code point span
617ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       because the standard algorithm would then have continued after
618ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       the set string match anyway.
619ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Remember the longest set string match (farthest end point) from the earliest
620ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       starting point.
621ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If no set string matched after spanSet.span(), then return
622ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     with where the spanSet.span() ended.
623ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If at least one set string matched, then continue the loop after the
624ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     longest match from the earliest position.
625ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If no more set string matched after a previous string match,
626ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
627ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     Stop if spanLength==0, otherwise continue the loop.
628ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
629ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
630ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
631ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
632ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return spanNot(s, length);
633ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
634ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
635ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanLength==length) {
636ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return length;
637ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
638ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Consider strings; they may overlap with the span.
640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList offsets;
641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_CONTAINED) {
642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Use offset list to try all possibilities.
643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offsets.setMaxLength(maxLength16);
644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=spanLength, rest=length-pos;
646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(;;) {
648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanCondition==USET_SPAN_CONTAINED) {
649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanLengths[i];
651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap==ALL_CP_CONTAINED) {
652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Irrelevant string.
653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UChar *s16=string.getBuffer();
656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length16=string.length();
657ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
658ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-overlap..pos.
659ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
660ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length16;
661ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // While contained: No point matching fully inside the code point span.
662ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
663ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
664ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
665ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
666ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
667ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
668ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
669ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(inc>rest) {
670ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
671ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
672ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the increment is not listed already.
673ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
674ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(inc==rest) {
675ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return length;  // Reached the end of the string.
676ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
677ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        offsets.addOffset(inc);
678ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
679ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(overlap==0) {
680ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
681ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
682ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
683ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++inc;
684ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
685ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
686ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else /* USET_SPAN_SIMPLE */ {
687ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t maxInc=0, maxOverlap=0;
688ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
689ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanLengths[i];
690ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // For longest match, we do need to try to match even an all-contained string
691ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // to find the match from the earliest start.
692ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
693ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
694ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UChar *s16=string.getBuffer();
695ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length16=string.length();
696ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
697ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-overlap..pos.
698ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
699ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length16;
700ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Longest match: Need to match fully inside the code point span
701ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // to find the match from the earliest start.
702ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
703ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
704ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
705ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
706ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
707ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
708ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(inc>rest || overlap<maxOverlap) {
709ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
710ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
711ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the string is longer or starts earlier.
712ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
713ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches16CPB(s, pos-overlap, length, s16, length16)
714ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
715ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxInc=inc;  // Longest match from earliest start.
716ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxOverlap=overlap;
717ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
718ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
719ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
720ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++inc;
721ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
722ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
723ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
724ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(maxInc!=0 || maxOverlap!=0) {
725ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Longest-match algorithm, and there was a string match.
726ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Simply continue after it.
727ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos+=maxInc;
728ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                rest-=maxInc;
729ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(rest==0) {
730ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return length;  // Reached the end of the string.
731ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
732ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=0;  // Match strings from after a string match.
733ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
734ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
735ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
736ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Finished trying to match all strings at pos.
737ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
738ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength!=0 || pos==0) {
739ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is after an unlimited code point span (spanLength!=0),
740ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // not after a string match.
741ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The only position where spanLength==0 after a span is pos==0.
742ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Otherwise, an unlimited code point span is only tried again when no
743ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // strings match, and if such a non-initial span fails we stop.
744ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
745ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // No strings matched after a span.
746ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
747ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Match strings from after the next string match.
748ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
749ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is after a string match (or a single code point).
750ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
751ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // No more strings matched after a previous string match.
752ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try another code point span from after the last string match.
753ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
754ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( spanLength==rest || // Reached the end of the string, or
755ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength==0       // neither strings nor span progressed.
756ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
757ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return pos+spanLength;
758ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
759ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos+=spanLength;
760ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                rest-=spanLength;
761ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // spanLength>0: Match strings from after a span.
762ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
763ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match only one code point from after a string match if some
764ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // string matched beyond it, so that we try all possible positions
765ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // and don't overshoot.
766ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanOne(spanSet, s+pos, rest);
767ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(spanLength>0) {
768ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(spanLength==rest) {
769ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return length;  // Reached the end of the string.
770ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
771ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match strings after this code point.
772ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // There cannot be any increments below it because UnicodeSet strings
773ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // contain multiple code points.
774ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pos+=spanLength;
775ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    rest-=spanLength;
776ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    offsets.shift(spanLength);
777ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength=0;
778ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Match strings from after a single code point.
779ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
780ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Match strings from after the next string match.
781ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
782ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
783ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t minOffset=offsets.popMinimum();
784ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=minOffset;
785ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest-=minOffset;
786ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=0;  // Match strings from after a string match.
787ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
788ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
789ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
790ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
791ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
792ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return spanNotBack(s, length);
793ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
794ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
795ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pos==0) {
796ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
797ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
798ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t spanLength=length-pos;
799ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
800ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Consider strings; they may overlap with the span.
801ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList offsets;
802ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_CONTAINED) {
803ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Use offset list to try all possibilities.
804ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offsets.setMaxLength(maxLength16);
805ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
806ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
807ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackLengths=spanLengths;
808ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
809ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackLengths+=stringsLength;
810ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
811ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(;;) {
812ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanCondition==USET_SPAN_CONTAINED) {
813ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
814ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanBackLengths[i];
815ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap==ALL_CP_CONTAINED) {
816ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Irrelevant string.
817ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
818ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
819ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UChar *s16=string.getBuffer();
820ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length16=string.length();
821ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
822ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-(length16-overlap)..pos-length16.
823ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
824ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length16;
825ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // While contained: No point matching fully inside the code point span.
826ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t len1=0;
827ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U16_FWD_1(s16, len1, overlap);
828ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap-=len1;  // Length of the string minus the first code point.
829ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
830ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
831ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
832ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
833ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
834ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
835ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(dec>pos) {
836ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
837ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
838ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the decrement is not listed already.
839ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
840ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(dec==pos) {
841ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return 0;  // Reached the start of the string.
842ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
843ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        offsets.addOffset(dec);
844ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
845ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(overlap==0) {
846ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
847ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
848ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
849ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++dec;
850ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
851ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
852ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else /* USET_SPAN_SIMPLE */ {
853ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t maxDec=0, maxOverlap=0;
854ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
855ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanBackLengths[i];
856ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // For longest match, we do need to try to match even an all-contained string
857ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // to find the match from the latest end.
858ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
859ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
860ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UChar *s16=string.getBuffer();
861ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length16=string.length();
862ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
863ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-(length16-overlap)..pos-length16.
864ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
865ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length16;
866ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Longest match: Need to match fully inside the code point span
867ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // to find the match from the latest end.
868ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
869ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
870ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
871ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
872ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
873ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
874ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(dec>pos || overlap<maxOverlap) {
875ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
876ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
877ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the string is longer or ends later.
878ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
879ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches16CPB(s, pos-dec, length, s16, length16)
880ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
881ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxDec=dec;  // Longest match from latest end.
882ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxOverlap=overlap;
883ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
884ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
885ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
886ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++dec;
887ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
888ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
889ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
890ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(maxDec!=0 || maxOverlap!=0) {
891ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Longest-match algorithm, and there was a string match.
892ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Simply continue before it.
893ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos-=maxDec;
894ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(pos==0) {
895ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return 0;  // Reached the start of the string.
896ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
897ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=0;  // Match strings from before a string match.
898ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
899ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
900ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
901ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Finished trying to match all strings at pos.
902ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
903ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength!=0 || pos==length) {
904ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is before an unlimited code point span (spanLength!=0),
905ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // not before a string match.
906ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The only position where spanLength==0 before a span is pos==length.
907ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Otherwise, an unlimited code point span is only tried again when no
908ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // strings match, and if such a non-initial span fails we stop.
909ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
910ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // No strings matched before a span.
911ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
912ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Match strings from before the next string match.
913ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
914ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is before a string match (or a single code point).
915ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
916ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // No more strings matched before a previous string match.
917ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try another code point span from before the last string match.
918ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t oldPos=pos;
919ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
920ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=oldPos-pos;
921ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( pos==0 ||           // Reached the start of the string, or
922ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength==0       // neither strings nor span progressed.
923ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
924ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return pos;
925ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
926ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // spanLength>0: Match strings from before a span.
927ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
928ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match only one code point from before a string match if some
929ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // string matched beyond it, so that we try all possible positions
930ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // and don't overshoot.
931ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanOneBack(spanSet, s, pos);
932ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(spanLength>0) {
933ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(spanLength==pos) {
934ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return 0;  // Reached the start of the string.
935ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
936ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match strings before this code point.
937ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // There cannot be any decrements below it because UnicodeSet strings
938ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // contain multiple code points.
939ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pos-=spanLength;
940ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    offsets.shift(spanLength);
941ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength=0;
942ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Match strings from before a single code point.
943ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
944ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Match strings from before the next string match.
945ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
946ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
947ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos-=offsets.popMinimum();
948ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=0;  // Match strings from before a string match.
949ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
950ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
951ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
952ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
953ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
954ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return spanNotUTF8(s, length);
955ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
956ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
957ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanLength==length) {
958ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return length;
959ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
960ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
961ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Consider strings; they may overlap with the span.
962ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList offsets;
963ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_CONTAINED) {
964ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Use offset list to try all possibilities.
965ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offsets.setMaxLength(maxLength8);
966ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
967ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=spanLength, rest=length-pos;
968ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
969ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanUTF8Lengths=spanLengths;
970ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
971ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanUTF8Lengths+=2*stringsLength;
972ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
973ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(;;) {
974ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const uint8_t *s8=utf8;
975ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length8;
976ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanCondition==USET_SPAN_CONTAINED) {
977ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
978ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                length8=utf8Lengths[i];
979ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {
980ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // String not representable in UTF-8.
981ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
982ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanUTF8Lengths[i];
983ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap==ALL_CP_CONTAINED) {
984ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    s8+=length8;
985ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Irrelevant string.
986ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
987ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
988ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-overlap..pos.
989ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
990ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length8;
991ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // While contained: No point matching fully inside the code point span.
992ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
993ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
994ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
995ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
996ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
997ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
998ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
999ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(inc>rest) {
1000ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1001ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1002ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the increment is not listed already.
1003ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match at code point boundaries. (The UTF-8 strings were converted
1004ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // from UTF-16 and are guaranteed to be well-formed.)
1005ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1006ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        !offsets.containsOffset(inc) &&
1007ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches8(s+pos-overlap, s8, length8)
1008ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1009ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
1010ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(inc==rest) {
1011ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return length;  // Reached the end of the string.
1012ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
1013ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        offsets.addOffset(inc);
1014ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1015ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(overlap==0) {
1016ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1017ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1018ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
1019ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++inc;
1020ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1021ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                s8+=length8;
1022ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1023ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else /* USET_SPAN_SIMPLE */ {
1024ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t maxInc=0, maxOverlap=0;
1025ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
1026ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                length8=utf8Lengths[i];
1027ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {
1028ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // String not representable in UTF-8.
1029ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1030ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanUTF8Lengths[i];
1031ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // For longest match, we do need to try to match even an all-contained string
1032ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // to find the match from the earliest start.
1033ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1034ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-overlap..pos.
1035ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
1036ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length8;
1037ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Longest match: Need to match fully inside the code point span
1038ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // to find the match from the earliest start.
1039ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1040ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
1041ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
1042ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1043ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1044ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
1045ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(inc>rest || overlap<maxOverlap) {
1046ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1047ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1048ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the string is longer or starts earlier.
1049ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match at code point boundaries. (The UTF-8 strings were converted
1050ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // from UTF-16 and are guaranteed to be well-formed.)
1051ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1052ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1053ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches8(s+pos-overlap, s8, length8)
1054ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1055ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
1056ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxInc=inc;  // Longest match from earliest start.
1057ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxOverlap=overlap;
1058ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1059ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1060ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
1061ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++inc;
1062ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1063ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                s8+=length8;
1064ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1065ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1066ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(maxInc!=0 || maxOverlap!=0) {
1067ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Longest-match algorithm, and there was a string match.
1068ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Simply continue after it.
1069ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos+=maxInc;
1070ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                rest-=maxInc;
1071ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(rest==0) {
1072ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return length;  // Reached the end of the string.
1073ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1074ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=0;  // Match strings from after a string match.
1075ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
1076ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1077ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1078ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Finished trying to match all strings at pos.
1079ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1080ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength!=0 || pos==0) {
1081ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is after an unlimited code point span (spanLength!=0),
1082ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // not after a string match.
1083ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The only position where spanLength==0 after a span is pos==0.
1084ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Otherwise, an unlimited code point span is only tried again when no
1085ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // strings match, and if such a non-initial span fails we stop.
1086ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
1087ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // No strings matched after a span.
1088ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1089ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Match strings from after the next string match.
1090ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
1091ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is after a string match (or a single code point).
1092ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
1093ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // No more strings matched after a previous string match.
1094ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try another code point span from after the last string match.
1095ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1096ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( spanLength==rest || // Reached the end of the string, or
1097ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength==0       // neither strings nor span progressed.
1098ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
1099ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return pos+spanLength;
1100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos+=spanLength;
1102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                rest-=spanLength;
1103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // spanLength>0: Match strings from after a span.
1104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
1105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match only one code point from after a string match if some
1106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // string matched beyond it, so that we try all possible positions
1107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // and don't overshoot.
1108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanOneUTF8(spanSet, s+pos, rest);
1109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(spanLength>0) {
1110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(spanLength==rest) {
1111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return length;  // Reached the end of the string.
1112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match strings after this code point.
1114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // There cannot be any increments below it because UnicodeSet strings
1115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // contain multiple code points.
1116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pos+=spanLength;
1117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    rest-=spanLength;
1118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    offsets.shift(spanLength);
1119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength=0;
1120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Match strings from after a single code point.
1121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Match strings from after the next string match.
1123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t minOffset=offsets.popMinimum();
1126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=minOffset;
1127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest-=minOffset;
1128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=0;  // Match strings from after a string match.
1129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return spanNotBackUTF8(s, length);
1135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pos==0) {
1138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
1139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t spanLength=length-pos;
1141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Consider strings; they may overlap with the span.
1143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList offsets;
1144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_CONTAINED) {
1145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Use offset list to try all possibilities.
1146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offsets.setMaxLength(maxLength8);
1147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackUTF8Lengths=spanLengths;
1150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
1151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackUTF8Lengths+=3*stringsLength;
1152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(;;) {
1154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const uint8_t *s8=utf8;
1155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length8;
1156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanCondition==USET_SPAN_CONTAINED) {
1157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
1158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                length8=utf8Lengths[i];
1159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {
1160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // String not representable in UTF-8.
1161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanBackUTF8Lengths[i];
1163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap==ALL_CP_CONTAINED) {
1164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    s8+=length8;
1165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Irrelevant string.
1166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-(length8-overlap)..pos-length8.
1169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
1170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length8;
1171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // While contained: No point matching fully inside the code point span.
1172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t len1=0;
1173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U8_FWD_1(s8, len1, overlap);
1174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap-=len1;  // Length of the string minus the first code point.
1175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
1177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
1178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
1181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(dec>pos) {
1182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the decrement is not listed already.
1185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match at code point boundaries. (The UTF-8 strings were converted
1186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // from UTF-16 and are guaranteed to be well-formed.)
1187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        !offsets.containsOffset(dec) &&
1189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches8(s+pos-dec, s8, length8)
1190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
1191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(dec==pos) {
1192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return 0;  // Reached the start of the string.
1193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
1194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        offsets.addOffset(dec);
1195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(overlap==0) {
1197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
1200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++dec;
1201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                s8+=length8;
1203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else /* USET_SPAN_SIMPLE */ {
1205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t maxDec=0, maxOverlap=0;
1206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
1207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                length8=utf8Lengths[i];
1208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {
1209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // String not representable in UTF-8.
1210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanBackUTF8Lengths[i];
1212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // For longest match, we do need to try to match even an all-contained string
1213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // to find the match from the latest end.
1214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-(length8-overlap)..pos-length8.
1216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
1217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length8;
1218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Longest match: Need to match fully inside the code point span
1219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // to find the match from the latest end.
1220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
1222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
1223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
1226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(dec>pos || overlap<maxOverlap) {
1227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the string is longer or ends later.
1230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match at code point boundaries. (The UTF-8 strings were converted
1231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // from UTF-16 and are guaranteed to be well-formed.)
1232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches8(s+pos-dec, s8, length8)
1235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
1236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxDec=dec;  // Longest match from latest end.
1237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxOverlap=overlap;
1238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
1241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++dec;
1242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                s8+=length8;
1244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(maxDec!=0 || maxOverlap!=0) {
1247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Longest-match algorithm, and there was a string match.
1248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Simply continue before it.
1249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos-=maxDec;
1250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(pos==0) {
1251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return 0;  // Reached the start of the string.
1252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=0;  // Match strings from before a string match.
1254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
1255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Finished trying to match all strings at pos.
1258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength!=0 || pos==length) {
1260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is before an unlimited code point span (spanLength!=0),
1261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // not before a string match.
1262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The only position where spanLength==0 before a span is pos==length.
1263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Otherwise, an unlimited code point span is only tried again when no
1264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // strings match, and if such a non-initial span fails we stop.
1265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
1266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // No strings matched before a span.
1267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Match strings from before the next string match.
1269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
1270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is before a string match (or a single code point).
1271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
1272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // No more strings matched before a previous string match.
1273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try another code point span from before the last string match.
1274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t oldPos=pos;
1275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=oldPos-pos;
1277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( pos==0 ||           // Reached the start of the string, or
1278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength==0       // neither strings nor span progressed.
1279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
1280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return pos;
1281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // spanLength>0: Match strings from before a span.
1283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
1284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match only one code point from before a string match if some
1285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // string matched beyond it, so that we try all possible positions
1286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // and don't overshoot.
1287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanOneBackUTF8(spanSet, s, pos);
1288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(spanLength>0) {
1289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(spanLength==pos) {
1290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return 0;  // Reached the start of the string.
1291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match strings before this code point.
1293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // There cannot be any decrements below it because UnicodeSet strings
1294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // contain multiple code points.
1295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pos-=spanLength;
1296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    offsets.shift(spanLength);
1297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength=0;
1298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Match strings from before a single code point.
1299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Match strings from before the next string match.
1301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos-=offsets.popMinimum();
1304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=0;  // Match strings from before a string match.
1305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
1309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm:
1312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary:
1313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If the code point there is in the set, then return with the current position.
1314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If a set string matches at the current position, then return with the current position.
1315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation:
1317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Same assumption as for span() above.)
1319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Create and cache a spanNotSet which contains all of the single code points
1321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of the original set but none of its strings.
1322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For each set string add its initial code point to the spanNotSet.
1323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Also add its final code point for spanNotBack().)
1324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop:
1326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If the current code point is in the original set, then
1328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     return the current position.
1329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If any set string matches at the current position, then
1330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     return the current position.
1331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If there is no match at the current position, neither for the code point there
1332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     nor for any set string, then skip this code point and continue the loop.
1333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     This happens for set-string-initial code points that were added to spanNotSet
1334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     when there is not actually a match for such a set string.
1335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
1336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=0, rest=length;
1339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
1341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Span until we find a code point from the set,
1342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // or a code point that starts or ends some string.
1343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i==rest) {
1345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return length;  // Reached the end of the string.
1346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=i;
1348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest-=i;
1349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Check whether the current code point is in the original set,
1351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // without the string starts and ends.
1352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t cpLength=spanOne(spanSet, s+pos, rest);
1353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(cpLength>0) {
1354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return pos;  // There is a set element at pos.
1355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Try to match the strings at pos.
1358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<stringsLength; ++i) {
1359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(spanLengths[i]==ALL_CP_CONTAINED) {
1360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // Irrelevant string.
1361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            const UChar *s16=string.getBuffer();
1364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t length16=string.length();
1365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // There is a set element at pos.
1367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The span(while not contained) ended on a string start/end which is
1371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // not in the original set. Skip this code point and continue.
1372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // cpLength<0
1373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos-=cpLength;
1374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest+=cpLength;
1375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(rest!=0);
1376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return length;  // Reached the end of the string.
1377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=length;
1381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
1383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Span until we find a code point from the set,
1384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // or a code point that starts or ends some string.
1385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(pos==0) {
1387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return 0;  // Reached the start of the string.
1388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Check whether the current code point is in the original set,
1391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // without the string starts and ends.
1392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t cpLength=spanOneBack(spanSet, s, pos);
1393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(cpLength>0) {
1394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return pos;  // There is a set element at pos.
1395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Try to match the strings at pos.
1398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<stringsLength; ++i) {
1399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Use spanLengths rather than a spanBackLengths pointer because
1400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // it is easier and we only need to know whether the string is irrelevant
1401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // which is the same in either array.
1402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(spanLengths[i]==ALL_CP_CONTAINED) {
1403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // Irrelevant string.
1404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            const UChar *s16=string.getBuffer();
1407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t length16=string.length();
1408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // There is a set element at pos.
1410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The span(while not contained) ended on a string start/end which is
1414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // not in the original set. Skip this code point and continue.
1415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // cpLength<0
1416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=cpLength;
1417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(pos!=0);
1418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return 0;  // Reached the start of the string.
1419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=0, rest=length;
1423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanUTF8Lengths=spanLengths;
1425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
1426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanUTF8Lengths+=2*stringsLength;
1427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
1429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Span until we find a code point from the set,
1430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // or a code point that starts or ends some string.
1431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i==rest) {
1433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return length;  // Reached the end of the string.
1434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=i;
1436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest-=i;
1437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Check whether the current code point is in the original set,
1439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // without the string starts and ends.
1440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(cpLength>0) {
1442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return pos;  // There is a set element at pos.
1443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Try to match the strings at pos.
1446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const uint8_t *s8=utf8;
1447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length8;
1448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<stringsLength; ++i) {
1449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            length8=utf8Lengths[i];
1450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // ALL_CP_CONTAINED: Irrelevant string.
1451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // There is a set element at pos.
1453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            s8+=length8;
1455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The span(while not contained) ended on a string start/end which is
1458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // not in the original set. Skip this code point and continue.
1459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // cpLength<0
1460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos-=cpLength;
1461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest+=cpLength;
1462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(rest!=0);
1463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return length;  // Reached the end of the string.
1464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=length;
1468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackUTF8Lengths=spanLengths;
1470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
1471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackUTF8Lengths+=3*stringsLength;
1472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
1474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Span until we find a code point from the set,
1475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // or a code point that starts or ends some string.
1476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(pos==0) {
1478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return 0;  // Reached the start of the string.
1479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Check whether the current code point is in the original set,
1482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // without the string starts and ends.
1483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(cpLength>0) {
1485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return pos;  // There is a set element at pos.
1486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Try to match the strings at pos.
1489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const uint8_t *s8=utf8;
1490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length8;
1491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<stringsLength; ++i) {
1492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            length8=utf8Lengths[i];
1493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // ALL_CP_CONTAINED: Irrelevant string.
1494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // There is a set element at pos.
1496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            s8+=length8;
1498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The span(while not contained) ended on a string start/end which is
1501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // not in the original set. Skip this code point and continue.
1502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // cpLength<0
1503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=cpLength;
1504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(pos!=0);
1505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return 0;  // Reached the start of the string.
1506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
1509