1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
48393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius*   Copyright (C) 2007-2012, International Business Machines
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   file name:  unisetspan.cpp
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   encoding:   US-ASCII
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   tab size:   8 (not used)
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   indentation:4
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created on: 2007mar01
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created by: Markus W. Scherer
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ustring.h"
20103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf8.h"
21103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf16.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h"
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unisetspan.h"
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * List of offsets from the current position from where to try matching
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * a code point or a string.
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Store offsets rather than indexes to simplify the code and use the same list
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for both increments (in span()) and decrements (in spanBack()).
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Assumption: The maximum offset is limited, and the offsets that are stored
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * at any one time are relatively dense, that is, there are normally no gaps of
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * hundreds or thousands of offset values.
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The implementation uses a circular buffer of byte flags,
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * each indicating whether the corresponding offset is in the list.
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This avoids inserting into a sorted list of offsets (or absolute indexes) and
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * physically moving part of the list.
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: In principle, the caller should setMaxLength() to the maximum of the
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * max string length and U16_LENGTH/U8_LENGTH to account for
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * "long" single code points.
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * However, this implementation uses at least a staticList with more than
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * U8_LENGTH entries anyway.
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: If maxLength were guaranteed to be no more than 32 or 64,
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the list could be stored as bit flags in a single integer.
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Rather than handling a circular buffer with a start list index,
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the integer would simply be shifted when lower offsets are removed.
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeSet does not have a limit on the lengths of strings.
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ~OffsetList() {
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(list!=staticList) {
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            uprv_free(list);
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Call exactly once if the list is to be used.
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void setMaxLength(int32_t maxLength) {
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(maxLength<=(int32_t)sizeof(staticList)) {
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            capacity=(int32_t)sizeof(staticList);
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            UBool *l=(UBool *)uprv_malloc(maxLength);
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(l!=NULL) {
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                list=l;
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                capacity=maxLength;
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_memset(list, 0, capacity);
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void clear() {
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_memset(list, 0, capacity);
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        start=length=0;
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool isEmpty() const {
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return (UBool)(length==0);
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Reduce all stored offsets by delta, used when the current position
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // moves by delta.
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // There must not be any offsets lower than delta.
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // If there is an offset equal to delta, it is removed.
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // delta=[1..maxLength]
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void shift(int32_t delta) {
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t i=start+delta;
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i>=capacity) {
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            i-=capacity;
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(list[i]) {
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            list[i]=FALSE;
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            --length;
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        start=i;
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Add an offset. The list must not contain it yet.
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // offset=[1..maxLength]
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void addOffset(int32_t offset) {
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t i=start+offset;
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i>=capacity) {
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            i-=capacity;
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        list[i]=TRUE;
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ++length;
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // offset=[1..maxLength]
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool containsOffset(int32_t offset) const {
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t i=start+offset;
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i>=capacity) {
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            i-=capacity;
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return list[i];
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Find the lowest stored offset from a non-empty list, remove it,
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // and reduce all other offsets by this minimum.
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Returns [1..maxLength].
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t popMinimum() {
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Look for the next offset in list[start+1..capacity-1].
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t i=start, result;
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while(++i<capacity) {
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(list[i]) {
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                list[i]=FALSE;
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                --length;
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                result=i-start;
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                start=i;
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return result;
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // i==capacity
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Wrap around and look for the next offset in list[0..start].
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Since the list is not empty, there will be one.
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        result=capacity-start;
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        i=0;
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while(!list[i]) {
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            ++i;
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        list[i]=FALSE;
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        --length;
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        start=i;
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return result+=i;
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate:
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool *list;
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t capacity;
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t length;
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t start;
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool staticList[16];
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerugetUTF8Length(const UChar *s, int32_t length) {
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode errorCode=U_ZERO_ERROR;
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t length8=0;
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return length8;
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The string contains an unpaired surrogate.
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Ignore this string.
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruappendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode errorCode=U_ZERO_ERROR;
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t length8=0;
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_SUCCESS(errorCode)) {
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return length8;
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The string contains an unpaired surrogate.
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Ignore this string.
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline uint8_t
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerumakeSpanLengthByte(int32_t spanLength) {
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // 0xfe==UnicodeSetStringSpan::LONG_SPAN
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Construct for all variants of span(), or only for any one variant.
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Initialize as little as possible, for single use.
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           const UVector &setStrings,
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           uint32_t which)
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          utf8Length(0),
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          maxLength16(0), maxLength8(0),
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          all((UBool)(which==ALL)) {
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    spanSet.retainAll(set);
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(which&NOT_CONTAINED) {
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Default to the same sets.
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // addToSpanNotSet() will create a separate set if necessary.
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pSpanNotSet=&spanSet;
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Determine if the strings even need to be taken into account at all for span() etc.
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // If any string is relevant, then all strings need to be used for
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // span(longest match) but only the relevant ones for span(while contained).
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t stringsLength=strings.size();
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, spanLength;
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool someRelevant=FALSE;
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(i=0; i<stringsLength; ++i) {
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const UChar *s16=string.getBuffer();
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length16=string.length();
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UBool thisRelevant;
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength<length16) {  // Relevant string.
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            someRelevant=thisRelevant=TRUE;
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            thisRelevant=FALSE;
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if((which&UTF16) && length16>maxLength16) {
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            maxLength16=length16;
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t length8=getUTF8Length(s16, length16);
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utf8Length+=length8;
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length8>maxLength8) {
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                maxLength8=length8;
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(!someRelevant) {
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        maxLength16=maxLength8=0;
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Freeze after checking for the need to use strings at all because freezing
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // a set takes some time and memory which are wasted if there are no relevant strings.
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanSet.freeze();
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackLengths;
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanUTF8Lengths;
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackUTF8Lengths;
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Allocate a block of meta data.
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t allocSize;
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        allocSize=stringsLength;  // One set of span lengths.
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(which&UTF8) {
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // UTF-8 lengths and UTF-8 strings.
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            allocSize+=stringsLength*4+utf8Length;
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(allocSize<=(int32_t)sizeof(staticLengths)) {
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8Lengths=staticLengths;
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(utf8Lengths==NULL) {
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;  // Out of memory.
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Store span lengths for all span() variants.
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackLengths=spanLengths+stringsLength;
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanUTF8Lengths=spanBackLengths+stringsLength;
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8=spanBackUTF8Lengths+stringsLength;
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Store span lengths for only one span() variant.
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(which&UTF8) {
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utf8=spanLengths+stringsLength;
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            spanLengths=(uint8_t *)utf8Lengths;
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Set the meta data and pSpanNotSet and write the UTF-8 strings.
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(i=0; i<stringsLength; ++i) {
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const UChar *s16=string.getBuffer();
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length16=string.length();
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength<length16) {  // Relevant string.
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(which&UTF16) {
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(which&CONTAINED) {
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(which&FWD) {
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        spanLengths[i]=makeSpanLengthByte(spanLength);
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(which&BACK) {
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        spanBackLengths[i]=makeSpanLengthByte(spanLength);
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(which&UTF8) {
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                uint8_t *s8=utf8+utf8Count;
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                utf8Count+=utf8Lengths[i]=length8;
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {  // Relevant for UTF-8.
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(which&CONTAINED) {
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(which&FWD) {
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(which&BACK) {
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(which&NOT_CONTAINED) {
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Add string start and end code points to the spanNotSet so that
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // a span(while not contained) stops before any string.
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                UChar32 c;
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(which&FWD) {
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t len=0;
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U16_NEXT(s16, len, length16, c);
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    addToSpanNotSet(c);
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(which&BACK) {
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t len=length16;
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U16_PREV(s16, 0, len, c);
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    addToSpanNotSet(c);
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {  // Irrelevant string.
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(which&UTF8) {
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    uint8_t *s8=utf8+utf8Count;
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    utf8Count+=utf8Lengths[i]=length8;
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    utf8Lengths[i]=0;
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(all) {
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLengths[i]=spanBackLengths[i]=
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        (uint8_t)ALL_CP_CONTAINED;
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // All spanXYZLengths pointers contain the same address.
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Finish.
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pSpanNotSet->freeze();
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Copy constructor. Assumes which==ALL for a frozen set.
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           const UVector &newParentSetStrings)
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          utf8Length(otherStringSpan.utf8Length),
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          all(TRUE) {
398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pSpanNotSet=&spanSet;
400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Allocate a block of meta data.
405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t stringsLength=strings.size();
407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(allocSize<=(int32_t)sizeof(staticLengths)) {
409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8Lengths=staticLengths;
410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(utf8Lengths==NULL) {
413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;  // Out of memory.
415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    utf8=spanLengths+stringsLength*4;
420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeSetStringSpan::~UnicodeSetStringSpan() {
424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete pSpanNotSet;
426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_free(utf8Lengths);
429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanSet.contains(c)) {
435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;  // Nothing to do.
436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(newSet==NULL) {
439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;  // Out of memory.
440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            pSpanNotSet=newSet;
442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    pSpanNotSet->add(c);
445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Compare strings without any argument checks. Requires length>0.
448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool
449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches16(const UChar *s, const UChar *t, int32_t length) {
450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(*s++!=*t++) {
452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return FALSE;
453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(--length>0);
455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return TRUE;
456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool
459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches8(const uint8_t *s, const uint8_t *t, int32_t length) {
460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(*s++!=*t++) {
462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return FALSE;
463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(--length>0);
465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return TRUE;
466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Compare 16-bit Unicode strings (which may be malformed UTF-16)
469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// at code point boundaries.
470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// That is, each edge of a match must not be in the middle of a surrogate pair.
471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline UBool
472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querumatches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    s+=start;
474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    limit-=start;
475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return matches16(s, t, length) &&
476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Does the set contain the next code point?
481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// If so, return its length; otherwise return its negative length.
482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t
483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar c=*s, c2;
485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return set.contains(c) ? 1 : -1;
489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t
492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar c=s[length-1], c2;
494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return set.contains(c) ? 1 : -1;
498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t
501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 c=*s;
503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if((int8_t)c>=0) {
504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return set.contains(c) ? 1 : -1;
505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
5068393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius    // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i=0;
5088393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius    U8_NEXT_OR_FFFD(s, i, length, c);
509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return set.contains(c) ? i : -i;
510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic inline int32_t
513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruspanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 c=s[length-1];
515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if((int8_t)c>=0) {
516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return set.contains(c) ? 1 : -1;
517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i=length-1;
5198393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius    c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    length-=i;
521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return set.contains(c) ? length : -length;
522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note: In span() when spanLength==0 (after a string match, or at the beginning
526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * after an empty code point span) and in spanNot() and spanNotUTF8(),
527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * string matching could use a binary search
528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * because all string matches are done from the same start index.
529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For UTF-8, this would require a comparison function that returns UTF-16 order.
531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This optimization should not be necessary for normal UnicodeSets because
533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * most sets have no strings, and most sets with strings have
534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * very few very short strings.
535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For cases with many strings, it might be better to use a different API
536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and implementation with a DFA (state machine).
537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for span(USET_SPAN_CONTAINED)
541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm:
543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary:
544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If the code point there is in the set, then remember to continue after it.
545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If a set string matches at the current position, then remember to continue after it.
546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Either recursively span for each code point or string match,
547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     or recursively span for all but the shortest one and
548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     iteratively continue the span with the shortest local match.
549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Remember the longest recursive span (the farthest end point).
550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If there is no match at the current position, neither for the code point there
551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     nor for any set string, then stop and return the longest recursive span length.
552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation:
554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (We assume that most sets will have very few very short strings.
556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * A span using a string-less set is extremely fast.)
557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Create and cache a spanSet which contains all of the single code points
559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of the original set but none of its strings.
560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop:
563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Try to match each set string at the end of the spanLength.
564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Set strings that start with set-contained code points must be matched
565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       with a partial overlap because the recursive algorithm would have tried
566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       to match them at every position.
567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Set strings that entirely consist of set-contained code points
568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       are irrelevant for span(USET_SPAN_CONTAINED) because the
569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       recursive algorithm would continue after them anyway
570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       and find the longest recursive match from their end.
571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Rather than recursing, note each end point of a set string match.
572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If no set string matched after spanSet.span(), then return
573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     with where the spanSet.span() ended.
574ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If at least one set string matched after spanSet.span(), then
575ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     pop the shortest string match end point and continue
576ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     the loop, trying to match all set strings from there.
577ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If at least one more set string matched after a previous string match,
578ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     then test if the code point after the previous string match is also
579ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     contained in the set.
580ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     Continue the loop with the shortest end point of either this code point
581ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     or a matching set string.
582ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If no more set string matched after a previous string match,
583ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
584ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     Stop if spanLength==0, otherwise continue the loop.
585ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
586ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * By noting each end point of a set string match,
587ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the function visits each string position at most once and finishes
588ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * in linear time.
589ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
590ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The recursive algorithm may visit the same string position many times
591ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * if multiple paths lead to it and finishes in exponential time.
592ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
593ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
594ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
595ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for span(USET_SPAN_SIMPLE)
596ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
597ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm:
598ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary:
599ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If the code point there is in the set, then remember to continue after it.
600ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If a set string matches at the current position, then remember to continue after it.
601ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Continue from the farthest match position and ignore all others.
602ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If there is no match at the current position,
603ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     then stop and return the current position.
604ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
605ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation:
606ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
607ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Same assumption and spanSet as above.)
608ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
609ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
610ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop:
611ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Try to match each set string at the end of the spanLength.
612ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Set strings that start with set-contained code points must be matched
613ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       with a partial overlap because the standard algorithm would have tried
614ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       to match them earlier.
615ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Set strings that entirely consist of set-contained code points
616ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       must be matched with a full overlap because the longest-match algorithm
617ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       would hide set string matches that end earlier.
618ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       Such set strings need not be matched earlier inside the code point span
619ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       because the standard algorithm would then have continued after
620ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       the set string match anyway.
621ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     ~ Remember the longest set string match (farthest end point) from the earliest
622ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *       starting point.
623ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If no set string matched after spanSet.span(), then return
624ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     with where the spanSet.span() ended.
625ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If at least one set string matched, then continue the loop after the
626ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     longest match from the earliest position.
627ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If no more set string matched after a previous string match,
628ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
629ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     Stop if spanLength==0, otherwise continue the loop.
630ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
631ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
632ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
633ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
634ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return spanNot(s, length);
635ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
636ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
637ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanLength==length) {
638ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return length;
639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Consider strings; they may overlap with the span.
642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList offsets;
643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_CONTAINED) {
644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Use offset list to try all possibilities.
645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offsets.setMaxLength(maxLength16);
646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=spanLength, rest=length-pos;
648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(;;) {
650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanCondition==USET_SPAN_CONTAINED) {
651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanLengths[i];
653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap==ALL_CP_CONTAINED) {
654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Irrelevant string.
655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
657ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UChar *s16=string.getBuffer();
658ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length16=string.length();
659ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
660ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-overlap..pos.
661ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
662ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length16;
663ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // While contained: No point matching fully inside the code point span.
664ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
665ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
666ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
667ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
668ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
669ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
670ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
671ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(inc>rest) {
672ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
673ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
674ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the increment is not listed already.
675ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
676ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(inc==rest) {
677ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return length;  // Reached the end of the string.
678ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
679ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        offsets.addOffset(inc);
680ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
681ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(overlap==0) {
682ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
683ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
684ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
685ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++inc;
686ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
687ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
688ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else /* USET_SPAN_SIMPLE */ {
689ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t maxInc=0, maxOverlap=0;
690ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
691ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanLengths[i];
692ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // For longest match, we do need to try to match even an all-contained string
693ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // to find the match from the earliest start.
694ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
695ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
696ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UChar *s16=string.getBuffer();
697ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length16=string.length();
698ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
699ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-overlap..pos.
700ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
701ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length16;
702ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Longest match: Need to match fully inside the code point span
703ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // to find the match from the earliest start.
704ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
705ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
706ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
707ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
708ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
709ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
710ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(inc>rest || overlap<maxOverlap) {
711ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
712ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
713ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the string is longer or starts earlier.
714ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
715ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches16CPB(s, pos-overlap, length, s16, length16)
716ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
717ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxInc=inc;  // Longest match from earliest start.
718ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxOverlap=overlap;
719ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
720ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
721ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
722ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++inc;
723ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
724ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
725ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
726ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(maxInc!=0 || maxOverlap!=0) {
727ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Longest-match algorithm, and there was a string match.
728ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Simply continue after it.
729ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos+=maxInc;
730ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                rest-=maxInc;
731ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(rest==0) {
732ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return length;  // Reached the end of the string.
733ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
734ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=0;  // Match strings from after a string match.
735ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
736ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
737ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
738ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Finished trying to match all strings at pos.
739ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
740ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength!=0 || pos==0) {
741ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is after an unlimited code point span (spanLength!=0),
742ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // not after a string match.
743ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The only position where spanLength==0 after a span is pos==0.
744ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Otherwise, an unlimited code point span is only tried again when no
745ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // strings match, and if such a non-initial span fails we stop.
746ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
747ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // No strings matched after a span.
748ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
749ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Match strings from after the next string match.
750ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
751ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is after a string match (or a single code point).
752ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
753ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // No more strings matched after a previous string match.
754ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try another code point span from after the last string match.
755ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
756ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( spanLength==rest || // Reached the end of the string, or
757ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength==0       // neither strings nor span progressed.
758ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
759ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return pos+spanLength;
760ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
761ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos+=spanLength;
762ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                rest-=spanLength;
763ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // spanLength>0: Match strings from after a span.
764ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
765ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match only one code point from after a string match if some
766ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // string matched beyond it, so that we try all possible positions
767ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // and don't overshoot.
768ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanOne(spanSet, s+pos, rest);
769ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(spanLength>0) {
770ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(spanLength==rest) {
771ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return length;  // Reached the end of the string.
772ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
773ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match strings after this code point.
774ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // There cannot be any increments below it because UnicodeSet strings
775ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // contain multiple code points.
776ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pos+=spanLength;
777ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    rest-=spanLength;
778ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    offsets.shift(spanLength);
779ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength=0;
780ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Match strings from after a single code point.
781ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
782ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Match strings from after the next string match.
783ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
784ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
785ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t minOffset=offsets.popMinimum();
786ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=minOffset;
787ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest-=minOffset;
788ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=0;  // Match strings from after a string match.
789ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
790ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
791ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
792ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
793ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
794ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return spanNotBack(s, length);
795ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
796ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
797ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pos==0) {
798ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
799ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
800ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t spanLength=length-pos;
801ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
802ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Consider strings; they may overlap with the span.
803ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList offsets;
804ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_CONTAINED) {
805ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Use offset list to try all possibilities.
806ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offsets.setMaxLength(maxLength16);
807ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
808ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
809ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackLengths=spanLengths;
810ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
811ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackLengths+=stringsLength;
812ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
813ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(;;) {
814ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanCondition==USET_SPAN_CONTAINED) {
815ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
816ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanBackLengths[i];
817ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap==ALL_CP_CONTAINED) {
818ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Irrelevant string.
819ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
820ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
821ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UChar *s16=string.getBuffer();
822ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length16=string.length();
823ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
824ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-(length16-overlap)..pos-length16.
825ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
826ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length16;
827ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // While contained: No point matching fully inside the code point span.
828ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t len1=0;
829ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U16_FWD_1(s16, len1, overlap);
830ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap-=len1;  // Length of the string minus the first code point.
831ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
832ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
833ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
834ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
835ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
836ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
837ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(dec>pos) {
838ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
839ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
840ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the decrement is not listed already.
841ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
842ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(dec==pos) {
843ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return 0;  // Reached the start of the string.
844ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
845ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        offsets.addOffset(dec);
846ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
847ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(overlap==0) {
848ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
849ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
850ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
851ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++dec;
852ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
853ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
854ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else /* USET_SPAN_SIMPLE */ {
855ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t maxDec=0, maxOverlap=0;
856ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
857ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanBackLengths[i];
858ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // For longest match, we do need to try to match even an all-contained string
859ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // to find the match from the latest end.
860ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
861ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
862ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                const UChar *s16=string.getBuffer();
863ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t length16=string.length();
864ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
865ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-(length16-overlap)..pos-length16.
866ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
867ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length16;
868ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Longest match: Need to match fully inside the code point span
869ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // to find the match from the latest end.
870ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
871ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
872ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
873ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
874ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
875ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
876ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(dec>pos || overlap<maxOverlap) {
877ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
878ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
879ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the string is longer or ends later.
880ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
881ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches16CPB(s, pos-dec, length, s16, length16)
882ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
883ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxDec=dec;  // Longest match from latest end.
884ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxOverlap=overlap;
885ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
886ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
887ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
888ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++dec;
889ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
890ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
891ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
892ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(maxDec!=0 || maxOverlap!=0) {
893ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Longest-match algorithm, and there was a string match.
894ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Simply continue before it.
895ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos-=maxDec;
896ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(pos==0) {
897ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return 0;  // Reached the start of the string.
898ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
899ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=0;  // Match strings from before a string match.
900ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
901ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
902ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
903ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Finished trying to match all strings at pos.
904ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
905ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength!=0 || pos==length) {
906ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is before an unlimited code point span (spanLength!=0),
907ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // not before a string match.
908ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The only position where spanLength==0 before a span is pos==length.
909ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Otherwise, an unlimited code point span is only tried again when no
910ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // strings match, and if such a non-initial span fails we stop.
911ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
912ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // No strings matched before a span.
913ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
914ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Match strings from before the next string match.
915ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
916ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is before a string match (or a single code point).
917ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
918ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // No more strings matched before a previous string match.
919ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try another code point span from before the last string match.
920ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t oldPos=pos;
921ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
922ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=oldPos-pos;
923ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( pos==0 ||           // Reached the start of the string, or
924ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength==0       // neither strings nor span progressed.
925ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
926ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return pos;
927ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
928ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // spanLength>0: Match strings from before a span.
929ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
930ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match only one code point from before a string match if some
931ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // string matched beyond it, so that we try all possible positions
932ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // and don't overshoot.
933ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanOneBack(spanSet, s, pos);
934ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(spanLength>0) {
935ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(spanLength==pos) {
936ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return 0;  // Reached the start of the string.
937ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
938ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match strings before this code point.
939ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // There cannot be any decrements below it because UnicodeSet strings
940ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // contain multiple code points.
941ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pos-=spanLength;
942ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    offsets.shift(spanLength);
943ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength=0;
944ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Match strings from before a single code point.
945ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
946ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Match strings from before the next string match.
947ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
948ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
949ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos-=offsets.popMinimum();
950ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=0;  // Match strings from before a string match.
951ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
952ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
953ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
954ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
955ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
956ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return spanNotUTF8(s, length);
957ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
958ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
959ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanLength==length) {
960ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return length;
961ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
962ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
963ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Consider strings; they may overlap with the span.
964ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList offsets;
965ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_CONTAINED) {
966ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Use offset list to try all possibilities.
967ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offsets.setMaxLength(maxLength8);
968ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
969ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=spanLength, rest=length-pos;
970ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
971ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanUTF8Lengths=spanLengths;
972ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
973ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanUTF8Lengths+=2*stringsLength;
974ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
975ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(;;) {
976ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const uint8_t *s8=utf8;
977ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length8;
978ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanCondition==USET_SPAN_CONTAINED) {
979ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
980ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                length8=utf8Lengths[i];
981ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {
982ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // String not representable in UTF-8.
983ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
984ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanUTF8Lengths[i];
985ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap==ALL_CP_CONTAINED) {
986ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    s8+=length8;
987ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Irrelevant string.
988ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
989ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
990ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-overlap..pos.
991ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
992ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length8;
993ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // While contained: No point matching fully inside the code point span.
994ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
995ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
996ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
997ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
998ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
999ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1000ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
1001ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(inc>rest) {
1002ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1003ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1004ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the increment is not listed already.
1005ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match at code point boundaries. (The UTF-8 strings were converted
1006ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // from UTF-16 and are guaranteed to be well-formed.)
1007ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1008ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        !offsets.containsOffset(inc) &&
1009ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches8(s+pos-overlap, s8, length8)
1010ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1011ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
1012ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(inc==rest) {
1013ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return length;  // Reached the end of the string.
1014ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
1015ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        offsets.addOffset(inc);
1016ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1017ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(overlap==0) {
1018ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1019ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1020ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
1021ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++inc;
1022ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1023ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                s8+=length8;
1024ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1025ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else /* USET_SPAN_SIMPLE */ {
1026ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t maxInc=0, maxOverlap=0;
1027ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
1028ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                length8=utf8Lengths[i];
1029ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {
1030ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // String not representable in UTF-8.
1031ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1032ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanUTF8Lengths[i];
1033ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // For longest match, we do need to try to match even an all-contained string
1034ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // to find the match from the earliest start.
1035ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1036ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-overlap..pos.
1037ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
1038ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length8;
1039ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Longest match: Need to match fully inside the code point span
1040ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // to find the match from the earliest start.
1041ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1042ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
1043ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
1044ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1045ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1046ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
1047ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(inc>rest || overlap<maxOverlap) {
1048ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1049ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1050ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the string is longer or starts earlier.
1051ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match at code point boundaries. (The UTF-8 strings were converted
1052ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // from UTF-16 and are guaranteed to be well-formed.)
1053ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1054ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1055ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches8(s+pos-overlap, s8, length8)
1056ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1057ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
1058ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxInc=inc;  // Longest match from earliest start.
1059ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxOverlap=overlap;
1060ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1061ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1062ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
1063ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++inc;
1064ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1065ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                s8+=length8;
1066ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1067ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1068ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(maxInc!=0 || maxOverlap!=0) {
1069ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Longest-match algorithm, and there was a string match.
1070ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Simply continue after it.
1071ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos+=maxInc;
1072ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                rest-=maxInc;
1073ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(rest==0) {
1074ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return length;  // Reached the end of the string.
1075ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1076ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=0;  // Match strings from after a string match.
1077ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
1078ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1079ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1080ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Finished trying to match all strings at pos.
1081ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1082ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength!=0 || pos==0) {
1083ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is after an unlimited code point span (spanLength!=0),
1084ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // not after a string match.
1085ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The only position where spanLength==0 after a span is pos==0.
1086ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Otherwise, an unlimited code point span is only tried again when no
1087ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // strings match, and if such a non-initial span fails we stop.
1088ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
1089ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // No strings matched after a span.
1090ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1091ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Match strings from after the next string match.
1092ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
1093ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is after a string match (or a single code point).
1094ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
1095ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // No more strings matched after a previous string match.
1096ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try another code point span from after the last string match.
1097ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1098ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( spanLength==rest || // Reached the end of the string, or
1099ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength==0       // neither strings nor span progressed.
1100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
1101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return pos+spanLength;
1102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos+=spanLength;
1104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                rest-=spanLength;
1105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // spanLength>0: Match strings from after a span.
1106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
1107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match only one code point from after a string match if some
1108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // string matched beyond it, so that we try all possible positions
1109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // and don't overshoot.
1110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanOneUTF8(spanSet, s+pos, rest);
1111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(spanLength>0) {
1112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(spanLength==rest) {
1113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return length;  // Reached the end of the string.
1114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match strings after this code point.
1116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // There cannot be any increments below it because UnicodeSet strings
1117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // contain multiple code points.
1118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pos+=spanLength;
1119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    rest-=spanLength;
1120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    offsets.shift(spanLength);
1121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength=0;
1122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Match strings from after a single code point.
1123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Match strings from after the next string match.
1125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t minOffset=offsets.popMinimum();
1128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=minOffset;
1129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest-=minOffset;
1130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=0;  // Match strings from after a string match.
1131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return spanNotBackUTF8(s, length);
1137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(pos==0) {
1140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
1141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t spanLength=length-pos;
1143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Consider strings; they may overlap with the span.
1145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OffsetList offsets;
1146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition==USET_SPAN_CONTAINED) {
1147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Use offset list to try all possibilities.
1148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offsets.setMaxLength(maxLength8);
1149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackUTF8Lengths=spanLengths;
1152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
1153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackUTF8Lengths+=3*stringsLength;
1154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(;;) {
1156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const uint8_t *s8=utf8;
1157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length8;
1158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanCondition==USET_SPAN_CONTAINED) {
1159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
1160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                length8=utf8Lengths[i];
1161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {
1162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // String not representable in UTF-8.
1163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanBackUTF8Lengths[i];
1165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap==ALL_CP_CONTAINED) {
1166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    s8+=length8;
1167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Irrelevant string.
1168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-(length8-overlap)..pos-length8.
1171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
1172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length8;
1173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // While contained: No point matching fully inside the code point span.
1174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    int32_t len1=0;
1175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    U8_FWD_1(s8, len1, overlap);
1176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap-=len1;  // Length of the string minus the first code point.
1177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
1179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
1180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
1183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(dec>pos) {
1184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the decrement is not listed already.
1187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match at code point boundaries. (The UTF-8 strings were converted
1188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // from UTF-16 and are guaranteed to be well-formed.)
1189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        !offsets.containsOffset(dec) &&
1191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches8(s+pos-dec, s8, length8)
1192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
1193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(dec==pos) {
1194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return 0;  // Reached the start of the string.
1195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
1196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        offsets.addOffset(dec);
1197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(overlap==0) {
1199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
1202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++dec;
1203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                s8+=length8;
1205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else /* USET_SPAN_SIMPLE */ {
1207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t maxDec=0, maxOverlap=0;
1208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for(i=0; i<stringsLength; ++i) {
1209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                length8=utf8Lengths[i];
1210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(length8==0) {
1211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // String not representable in UTF-8.
1212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t overlap=spanBackUTF8Lengths[i];
1214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // For longest match, we do need to try to match even an all-contained string
1215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // to find the match from the latest end.
1216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match this string at pos-(length8-overlap)..pos-length8.
1218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>=LONG_SPAN) {
1219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=length8;
1220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Longest match: Need to match fully inside the code point span
1221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // to find the match from the latest end.
1222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(overlap>spanLength) {
1224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    overlap=spanLength;
1225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for(;;) {
1228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(dec>pos || overlap<maxOverlap) {
1229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Try to match if the string is longer or ends later.
1232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match at code point boundaries. (The UTF-8 strings were converted
1233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // from UTF-16 and are guaranteed to be well-formed.)
1234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        matches8(s+pos-dec, s8, length8)
1237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) {
1238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxDec=dec;  // Longest match from latest end.
1239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        maxOverlap=overlap;
1240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
1241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    --overlap;
1243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ++dec;
1244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                s8+=length8;
1246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(maxDec!=0 || maxOverlap!=0) {
1249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Longest-match algorithm, and there was a string match.
1250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Simply continue before it.
1251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos-=maxDec;
1252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(pos==0) {
1253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return 0;  // Reached the start of the string.
1254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=0;  // Match strings from before a string match.
1256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
1257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Finished trying to match all strings at pos.
1260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanLength!=0 || pos==length) {
1262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is before an unlimited code point span (spanLength!=0),
1263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // not before a string match.
1264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The only position where spanLength==0 before a span is pos==length.
1265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Otherwise, an unlimited code point span is only tried again when no
1266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // strings match, and if such a non-initial span fails we stop.
1267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
1268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // No strings matched before a span.
1269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Match strings from before the next string match.
1271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
1272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // The position is before a string match (or a single code point).
1273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(offsets.isEmpty()) {
1274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // No more strings matched before a previous string match.
1275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try another code point span from before the last string match.
1276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t oldPos=pos;
1277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=oldPos-pos;
1279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( pos==0 ||           // Reached the start of the string, or
1280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength==0       // neither strings nor span progressed.
1281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
1282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return pos;
1283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // spanLength>0: Match strings from before a span.
1285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
1286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Try to match only one code point from before a string match if some
1287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // string matched beyond it, so that we try all possible positions
1288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // and don't overshoot.
1289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                spanLength=spanOneBackUTF8(spanSet, s, pos);
1290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(spanLength>0) {
1291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(spanLength==pos) {
1292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return 0;  // Reached the start of the string.
1293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
1294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Match strings before this code point.
1295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // There cannot be any decrements below it because UnicodeSet strings
1296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // contain multiple code points.
1297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pos-=spanLength;
1298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    offsets.shift(spanLength);
1299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    spanLength=0;
1300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;  // Match strings from before a single code point.
1301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
1302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Match strings from before the next string match.
1303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos-=offsets.popMinimum();
1306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanLength=0;  // Match strings from before a string match.
1307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
1311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Theoretical algorithm:
1314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Iterate through the string, and at each code point boundary:
1315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If the code point there is in the set, then return with the current position.
1316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If a set string matches at the current position, then return with the current position.
1317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimized implementation:
1319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Same assumption as for span() above.)
1321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Create and cache a spanNotSet which contains all of the single code points
1323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * of the original set but none of its strings.
1324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For each set string add its initial code point to the spanNotSet.
1325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (Also add its final code point for spanNotBack().)
1326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - Loop:
1328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If the current code point is in the original set, then
1330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     return the current position.
1331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If any set string matches at the current position, then
1332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     return the current position.
1333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *   + If there is no match at the current position, neither for the code point there
1334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     nor for any set string, then skip this code point and continue the loop.
1335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     This happens for set-string-initial code points that were added to spanNotSet
1336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *     when there is not actually a match for such a set string.
1337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
1338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=0, rest=length;
1341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
1343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Span until we find a code point from the set,
1344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // or a code point that starts or ends some string.
1345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i==rest) {
1347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return length;  // Reached the end of the string.
1348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=i;
1350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest-=i;
1351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Check whether the current code point is in the original set,
1353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // without the string starts and ends.
1354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t cpLength=spanOne(spanSet, s+pos, rest);
1355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(cpLength>0) {
1356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return pos;  // There is a set element at pos.
1357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Try to match the strings at pos.
1360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<stringsLength; ++i) {
1361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(spanLengths[i]==ALL_CP_CONTAINED) {
1362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // Irrelevant string.
1363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            const UChar *s16=string.getBuffer();
1366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t length16=string.length();
1367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // There is a set element at pos.
1369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The span(while not contained) ended on a string start/end which is
1373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // not in the original set. Skip this code point and continue.
1374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // cpLength<0
1375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos-=cpLength;
1376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest+=cpLength;
1377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(rest!=0);
1378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return length;  // Reached the end of the string.
1379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=length;
1383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
1385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Span until we find a code point from the set,
1386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // or a code point that starts or ends some string.
1387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(pos==0) {
1389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return 0;  // Reached the start of the string.
1390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Check whether the current code point is in the original set,
1393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // without the string starts and ends.
1394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t cpLength=spanOneBack(spanSet, s, pos);
1395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(cpLength>0) {
1396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return pos;  // There is a set element at pos.
1397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Try to match the strings at pos.
1400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<stringsLength; ++i) {
1401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Use spanLengths rather than a spanBackLengths pointer because
1402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // it is easier and we only need to know whether the string is irrelevant
1403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // which is the same in either array.
1404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(spanLengths[i]==ALL_CP_CONTAINED) {
1405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;  // Irrelevant string.
1406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            const UChar *s16=string.getBuffer();
1409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int32_t length16=string.length();
1410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // There is a set element at pos.
1412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The span(while not contained) ended on a string start/end which is
1416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // not in the original set. Skip this code point and continue.
1417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // cpLength<0
1418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=cpLength;
1419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(pos!=0);
1420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return 0;  // Reached the start of the string.
1421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=0, rest=length;
1425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanUTF8Lengths=spanLengths;
1427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
1428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanUTF8Lengths+=2*stringsLength;
1429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
1431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Span until we find a code point from the set,
1432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // or a code point that starts or ends some string.
1433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(i==rest) {
1435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return length;  // Reached the end of the string.
1436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=i;
1438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest-=i;
1439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Check whether the current code point is in the original set,
1441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // without the string starts and ends.
1442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(cpLength>0) {
1444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return pos;  // There is a set element at pos.
1445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Try to match the strings at pos.
1448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const uint8_t *s8=utf8;
1449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length8;
1450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<stringsLength; ++i) {
1451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            length8=utf8Lengths[i];
1452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // ALL_CP_CONTAINED: Irrelevant string.
1453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // There is a set element at pos.
1455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            s8+=length8;
1457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The span(while not contained) ended on a string start/end which is
1460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // not in the original set. Skip this code point and continue.
1461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // cpLength<0
1462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos-=cpLength;
1463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rest+=cpLength;
1464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(rest!=0);
1465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return length;  // Reached the end of the string.
1466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t pos=length;
1470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, stringsLength=strings.size();
1471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t *spanBackUTF8Lengths=spanLengths;
1472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(all) {
1473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanBackUTF8Lengths+=3*stringsLength;
1474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
1475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
1476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Span until we find a code point from the set,
1477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // or a code point that starts or ends some string.
1478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(pos==0) {
1480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return 0;  // Reached the start of the string.
1481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Check whether the current code point is in the original set,
1484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // without the string starts and ends.
1485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(cpLength>0) {
1487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return pos;  // There is a set element at pos.
1488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Try to match the strings at pos.
1491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        const uint8_t *s8=utf8;
1492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length8;
1493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<stringsLength; ++i) {
1494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            length8=utf8Lengths[i];
1495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // ALL_CP_CONTAINED: Irrelevant string.
1496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return pos;  // There is a set element at pos.
1498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
1499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            s8+=length8;
1500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
1501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The span(while not contained) ended on a string start/end which is
1503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // not in the original set. Skip this code point and continue.
1504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // cpLength<0
1505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pos+=cpLength;
1506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(pos!=0);
1507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return 0;  // Reached the start of the string.
1508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
1509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
1511