1f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/*
2f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)******************************************************************************
3f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*
4f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   Copyright (C) 2007, International Business Machines
5f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   Corporation and others.  All Rights Reserved.
6f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*
7f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)******************************************************************************
8f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   file name:  unisetspan.cpp
9f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   encoding:   US-ASCII
10f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   tab size:   8 (not used)
11f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   indentation:4
12f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*
13f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   created on: 2007mar01
14f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*   created by: Markus W. Scherer
15f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/
16f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
17f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/utypes.h"
18f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/uniset.h"
19f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ustring.h"
20f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "cmemory.h"
21f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uvector.h"
22f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unisetspan.h"
23f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
24f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_NAMESPACE_BEGIN
25f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
26f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/*
27f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * List of offsets from the current position from where to try matching
28f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * a code point or a string.
29f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Store offsets rather than indexes to simplify the code and use the same list
30f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * for both increments (in span()) and decrements (in spanBack()).
31f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
32f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Assumption: The maximum offset is limited, and the offsets that are stored
33f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * at any one time are relatively dense, that is, there are normally no gaps of
34f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * hundreds or thousands of offset values.
35f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
36f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The implementation uses a circular buffer of byte flags,
37f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * each indicating whether the corresponding offset is in the list.
38f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This avoids inserting into a sorted list of offsets (or absolute indexes) and
39f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * physically moving part of the list.
40f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
41f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Note: In principle, the caller should setMaxLength() to the maximum of the
42f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * max string length and U16_LENGTH/U8_LENGTH to account for
43f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * "long" single code points.
44f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * However, this implementation uses at least a staticList with more than
45f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * U8_LENGTH entries anyway.
46f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
47f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Note: If maxLength were guaranteed to be no more than 32 or 64,
48f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the list could be stored as bit flags in a single integer.
49f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Rather than handling a circular buffer with a start list index,
50f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the integer would simply be shifted when lower offsets are removed.
51f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * UnicodeSet does not have a limit on the lengths of strings.
52f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */
53f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
54f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)public:
55f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
56f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
57f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    ~OffsetList() {
58f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(list!=staticList) {
59f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            uprv_free(list);
60f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
61f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
62f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
63f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Call exactly once if the list is to be used.
64f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    void setMaxLength(int32_t maxLength) {
65f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(maxLength<=(int32_t)sizeof(staticList)) {
66f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            capacity=(int32_t)sizeof(staticList);
67f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else {
68f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            UBool *l=(UBool *)uprv_malloc(maxLength);
69f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(l!=NULL) {
70f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                list=l;
71f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                capacity=maxLength;
72f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
73f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
74f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        uprv_memset(list, 0, capacity);
75f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
76f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
77f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    void clear() {
78f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        uprv_memset(list, 0, capacity);
79f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        start=length=0;
80f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
81f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
82f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UBool isEmpty() const {
83f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return (UBool)(length==0);
84f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
85f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
86f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Reduce all stored offsets by delta, used when the current position
87f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // moves by delta.
88f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // There must not be any offsets lower than delta.
89f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // If there is an offset equal to delta, it is removed.
90f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // delta=[1..maxLength]
91f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    void shift(int32_t delta) {
92f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t i=start+delta;
93f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(i>=capacity) {
94f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            i-=capacity;
95f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
96f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(list[i]) {
97f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            list[i]=FALSE;
98f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            --length;
99f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        start=i;
101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Add an offset. The list must not contain it yet.
104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // offset=[1..maxLength]
105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    void addOffset(int32_t offset) {
106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t i=start+offset;
107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(i>=capacity) {
108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            i-=capacity;
109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        list[i]=TRUE;
111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        ++length;
112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // offset=[1..maxLength]
115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UBool containsOffset(int32_t offset) const {
116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t i=start+offset;
117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(i>=capacity) {
118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            i-=capacity;
119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return list[i];
121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Find the lowest stored offset from a non-empty list, remove it,
124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // and reduce all other offsets by this minimum.
125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Returns [1..maxLength].
126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t popMinimum() {
127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Look for the next offset in list[start+1..capacity-1].
128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t i=start, result;
129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        while(++i<capacity) {
130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(list[i]) {
131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                list[i]=FALSE;
132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                --length;
133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                result=i-start;
134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                start=i;
135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                return result;
136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // i==capacity
139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Wrap around and look for the next offset in list[0..start].
141f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Since the list is not empty, there will be one.
142f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        result=capacity-start;
143f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        i=0;
144f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        while(!list[i]) {
145f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            ++i;
146f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
147f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        list[i]=FALSE;
148f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        --length;
149f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        start=i;
150f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return result+=i;
151f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
152f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
153f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)private:
154f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UBool *list;
155f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t capacity;
156f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t length;
157f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t start;
158f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
159f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UBool staticList[16];
160f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)};
161f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
162f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
163f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static int32_t
164f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)getUTF8Length(const UChar *s, int32_t length) {
165f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UErrorCode errorCode=U_ZERO_ERROR;
166f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t length8=0;
167f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
168f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
169f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return length8;
170f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } else {
171f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // The string contains an unpaired surrogate.
172f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Ignore this string.
173f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return 0;
174f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
175f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
176f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
177f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
178f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static int32_t
179f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
180f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UErrorCode errorCode=U_ZERO_ERROR;
181f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t length8=0;
182f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
183f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(U_SUCCESS(errorCode)) {
184f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return length8;
185f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } else {
186f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // The string contains an unpaired surrogate.
187f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Ignore this string.
188f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return 0;
189f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
190f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
191f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
192f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline uint8_t
193f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)makeSpanLengthByte(int32_t spanLength) {
194f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // 0xfe==UnicodeSetStringSpan::LONG_SPAN
195f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
196f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
197f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
198f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Construct for all variants of span(), or only for any one variant.
199f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Initialize as little as possible, for single use.
200f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
201f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                                           const UVector &setStrings,
202f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                                           uint32_t which)
203f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
204f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
205f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          utf8Length(0),
206f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          maxLength16(0), maxLength8(0),
207f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          all((UBool)(which==ALL)) {
208f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    spanSet.retainAll(set);
209f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(which&NOT_CONTAINED) {
210f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Default to the same sets.
211f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // addToSpanNotSet() will create a separate set if necessary.
212f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pSpanNotSet=&spanSet;
213f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
214f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
215f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Determine if the strings even need to be taken into account at all for span() etc.
216f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // If any string is relevant, then all strings need to be used for
217f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // span(longest match) but only the relevant ones for span(while contained).
218f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
219f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
220f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
221f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
222f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t stringsLength=strings.size();
223f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
224f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i, spanLength;
225f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UBool someRelevant=FALSE;
226f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    for(i=0; i<stringsLength; ++i) {
227f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
228f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        const UChar *s16=string.getBuffer();
229f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t length16=string.length();
230f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        UBool thisRelevant;
231f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
232f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanLength<length16) {  // Relevant string.
233f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            someRelevant=thisRelevant=TRUE;
234f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else {
235f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            thisRelevant=FALSE;
236f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
237f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if((which&UTF16) && length16>maxLength16) {
238f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            maxLength16=length16;
239f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
240f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
241f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            int32_t length8=getUTF8Length(s16, length16);
242f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            utf8Length+=length8;
243f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(length8>maxLength8) {
244f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                maxLength8=length8;
245f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
246f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
247f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
248f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(!someRelevant) {
249f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        maxLength16=maxLength8=0;
250f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return;
251f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
252f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
253f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Freeze after checking for the need to use strings at all because freezing
254f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // a set takes some time and memory which are wasted if there are no relevant strings.
255f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(all) {
256f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanSet.freeze();
257f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
258f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
259f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    uint8_t *spanBackLengths;
260f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    uint8_t *spanUTF8Lengths;
261f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    uint8_t *spanBackUTF8Lengths;
262f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
263f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Allocate a block of meta data.
264f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t allocSize;
265f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(all) {
266f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
267f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
268f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } else {
269f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        allocSize=stringsLength;  // One set of span lengths.
270f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(which&UTF8) {
271f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // UTF-8 lengths and UTF-8 strings.
272f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            allocSize+=stringsLength*4+utf8Length;
273f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
274f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
275f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(allocSize<=(int32_t)sizeof(staticLengths)) {
276f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        utf8Lengths=staticLengths;
277f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } else {
278f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
279f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(utf8Lengths==NULL) {
280f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
281f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return;  // Out of memory.
282f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
283f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
284f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
285f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(all) {
286f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Store span lengths for all span() variants.
287f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
288f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanBackLengths=spanLengths+stringsLength;
289f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanUTF8Lengths=spanBackLengths+stringsLength;
290f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
291f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        utf8=spanBackUTF8Lengths+stringsLength;
292f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } else {
293f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Store span lengths for only one span() variant.
294f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(which&UTF8) {
295f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
296f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            utf8=spanLengths+stringsLength;
297f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else {
298f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            spanLengths=(uint8_t *)utf8Lengths;
299f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
300f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
301f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
302f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
303f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Set the meta data and pSpanNotSet and write the UTF-8 strings.
304f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
305f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
306f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    for(i=0; i<stringsLength; ++i) {
307f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
308f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        const UChar *s16=string.getBuffer();
309f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t length16=string.length();
310f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
311f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanLength<length16) {  // Relevant string.
312f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(which&UTF16) {
313f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(which&CONTAINED) {
314f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(which&FWD) {
315f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        spanLengths[i]=makeSpanLengthByte(spanLength);
316f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
317f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(which&BACK) {
318f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
319f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        spanBackLengths[i]=makeSpanLengthByte(spanLength);
320f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
321f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
322f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
323f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
324f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
325f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(which&UTF8) {
326f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                uint8_t *s8=utf8+utf8Count;
327f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
328f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                utf8Count+=utf8Lengths[i]=length8;
329f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
330f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
331f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                } else {  // Relevant for UTF-8.
332f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(which&CONTAINED) {
333f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        if(which&FWD) {
334f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                            spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
335f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
336f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        }
337f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        if(which&BACK) {
338f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                            spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
339f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        }
341f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
342f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
343f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
344f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
345f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
346f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(which&NOT_CONTAINED) {
347f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Add string start and end code points to the spanNotSet so that
348f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // a span(while not contained) stops before any string.
349f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                UChar32 c;
350f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(which&FWD) {
351f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    int32_t len=0;
352f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    U16_NEXT(s16, len, length16, c);
353f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    addToSpanNotSet(c);
354f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
355f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(which&BACK) {
356f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    int32_t len=length16;
357f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    U16_PREV(s16, 0, len, c);
358f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    addToSpanNotSet(c);
359f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
360f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
361f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else {  // Irrelevant string.
362f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(which&UTF8) {
363f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
364f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    uint8_t *s8=utf8+utf8Count;
365f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
366f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    utf8Count+=utf8Lengths[i]=length8;
367f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                } else {
368f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    utf8Lengths[i]=0;
369f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
370f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
371f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(all) {
372f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLengths[i]=spanBackLengths[i]=
373f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
374f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        (uint8_t)ALL_CP_CONTAINED;
375f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            } else {
376f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // All spanXYZLengths pointers contain the same address.
377f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
378f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
379f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
380f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
381f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
382f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Finish.
383f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(all) {
384f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pSpanNotSet->freeze();
385f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
386f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
387f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
388f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Copy constructor. Assumes which==ALL for a frozen set.
389f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
390f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                                           const UVector &newParentSetStrings)
391f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
392f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
393f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          utf8Length(otherStringSpan.utf8Length),
394f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
395f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)          all(TRUE) {
396f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
397f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pSpanNotSet=&spanSet;
398f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } else {
399f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
400f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
401f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
402f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Allocate a block of meta data.
403f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
404f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t stringsLength=strings.size();
405f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
406f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(allocSize<=(int32_t)sizeof(staticLengths)) {
407f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        utf8Lengths=staticLengths;
408f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } else {
409f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
410f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(utf8Lengths==NULL) {
411f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
412f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return;  // Out of memory.
413f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
414f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
415f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
416f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
417f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    utf8=spanLengths+stringsLength*4;
418f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
419f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
420f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
421f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UnicodeSetStringSpan::~UnicodeSetStringSpan() {
422f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
423f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        delete pSpanNotSet;
424f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
425f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
426f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        uprv_free(utf8Lengths);
427f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
428f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
429f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
430f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
431f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
432f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanSet.contains(c)) {
433f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return;  // Nothing to do.
434f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
435f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
436f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(newSet==NULL) {
437f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return;  // Out of memory.
438f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else {
439f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            pSpanNotSet=newSet;
440f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
441f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
442f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    pSpanNotSet->add(c);
443f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
444f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
445f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Compare strings without any argument checks. Requires length>0.
446f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline UBool
447f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)matches16(const UChar *s, const UChar *t, int32_t length) {
448f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    do {
449f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(*s++!=*t++) {
450f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return FALSE;
451f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
452f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } while(--length>0);
453f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return TRUE;
454f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
455f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
456f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline UBool
457f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
458f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    do {
459f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(*s++!=*t++) {
460f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return FALSE;
461f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
462f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } while(--length>0);
463f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return TRUE;
464f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
465f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
466f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Compare 16-bit Unicode strings (which may be malformed UTF-16)
467f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// at code point boundaries.
468f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// That is, each edge of a match must not be in the middle of a surrogate pair.
469f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline UBool
470f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
471f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    s+=start;
472f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    limit-=start;
473f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return matches16(s, t, length) &&
474f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
475f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
476f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
477f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
478f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Does the set contain the next code point?
479f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// If so, return its length; otherwise return its negative length.
480f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline int32_t
481f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
482f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UChar c=*s, c2;
483f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
484f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
485f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
486f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return set.contains(c) ? 1 : -1;
487f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
488f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
489f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline int32_t
490f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
491f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UChar c=s[length-1], c2;
492f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
493f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
494f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
495f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return set.contains(c) ? 1 : -1;
496f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
497f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
498f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline int32_t
499f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
500f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UChar32 c=*s;
501f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if((int8_t)c>=0) {
502f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return set.contains(c) ? 1 : -1;
503f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
504f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Take advantage of non-ASCII fastpaths in U8_NEXT().
505f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i=0;
506f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    U8_NEXT(s, i, length, c);
507f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return set.contains(c) ? i : -i;
508f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
509f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
510f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static inline int32_t
511f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
512f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    UChar32 c=s[length-1];
513f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if((int8_t)c>=0) {
514f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return set.contains(c) ? 1 : -1;
515f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
516f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i=length-1;
517f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
518f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    length-=i;
519f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return set.contains(c) ? length : -length;
520f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
521f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
522f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/*
523f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Note: In span() when spanLength==0 (after a string match, or at the beginning
524f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * after an empty code point span) and in spanNot() and spanNotUTF8(),
525f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * string matching could use a binary search
526f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * because all string matches are done from the same start index.
527f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
528f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * For UTF-8, this would require a comparison function that returns UTF-16 order.
529f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
530f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This optimization should not be necessary for normal UnicodeSets because
531f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * most sets have no strings, and most sets with strings have
532f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * very few very short strings.
533f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * For cases with many strings, it might be better to use a different API
534f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and implementation with a DFA (state machine).
535f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */
536f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
537f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/*
538f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Algorithm for span(USET_SPAN_CONTAINED)
539f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
540f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Theoretical algorithm:
541f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Iterate through the string, and at each code point boundary:
542f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If the code point there is in the set, then remember to continue after it.
543f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If a set string matches at the current position, then remember to continue after it.
544f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + Either recursively span for each code point or string match,
545f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     or recursively span for all but the shortest one and
546f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     iteratively continue the span with the shortest local match.
547f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + Remember the longest recursive span (the farthest end point).
548f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If there is no match at the current position, neither for the code point there
549f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     nor for any set string, then stop and return the longest recursive span length.
550f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
551f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Optimized implementation:
552f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
553f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (We assume that most sets will have very few very short strings.
554f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * A span using a string-less set is extremely fast.)
555f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
556f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Create and cache a spanSet which contains all of the single code points
557f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * of the original set but none of its strings.
558f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
559f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
560f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Loop:
561f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + Try to match each set string at the end of the spanLength.
562f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     ~ Set strings that start with set-contained code points must be matched
563f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       with a partial overlap because the recursive algorithm would have tried
564f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       to match them at every position.
565f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     ~ Set strings that entirely consist of set-contained code points
566f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       are irrelevant for span(USET_SPAN_CONTAINED) because the
567f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       recursive algorithm would continue after them anyway
568f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       and find the longest recursive match from their end.
569f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     ~ Rather than recursing, note each end point of a set string match.
570f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If no set string matched after spanSet.span(), then return
571f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     with where the spanSet.span() ended.
572f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If at least one set string matched after spanSet.span(), then
573f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     pop the shortest string match end point and continue
574f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     the loop, trying to match all set strings from there.
575f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If at least one more set string matched after a previous string match,
576f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     then test if the code point after the previous string match is also
577f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     contained in the set.
578f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     Continue the loop with the shortest end point of either this code point
579f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     or a matching set string.
580f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If no more set string matched after a previous string match,
581f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
582f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     Stop if spanLength==0, otherwise continue the loop.
583f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
584f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * By noting each end point of a set string match,
585f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the function visits each string position at most once and finishes
586f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * in linear time.
587f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
588f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The recursive algorithm may visit the same string position many times
589f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * if multiple paths lead to it and finishes in exponential time.
590f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */
591f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
592f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/*
593f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Algorithm for span(USET_SPAN_SIMPLE)
594f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
595f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Theoretical algorithm:
596f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Iterate through the string, and at each code point boundary:
597f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If the code point there is in the set, then remember to continue after it.
598f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If a set string matches at the current position, then remember to continue after it.
599f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + Continue from the farthest match position and ignore all others.
600f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If there is no match at the current position,
601f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     then stop and return the current position.
602f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
603f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Optimized implementation:
604f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
605f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (Same assumption and spanSet as above.)
606f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
607f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
608f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Loop:
609f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + Try to match each set string at the end of the spanLength.
610f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     ~ Set strings that start with set-contained code points must be matched
611f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       with a partial overlap because the standard algorithm would have tried
612f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       to match them earlier.
613f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     ~ Set strings that entirely consist of set-contained code points
614f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       must be matched with a full overlap because the longest-match algorithm
615f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       would hide set string matches that end earlier.
616f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       Such set strings need not be matched earlier inside the code point span
617f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       because the standard algorithm would then have continued after
618f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       the set string match anyway.
619f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     ~ Remember the longest set string match (farthest end point) from the earliest
620f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *       starting point.
621f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If no set string matched after spanSet.span(), then return
622f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     with where the spanSet.span() ended.
623f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If at least one set string matched, then continue the loop after the
624f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     longest match from the earliest position.
625f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If no more set string matched after a previous string match,
626f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
627f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     Stop if spanLength==0, otherwise continue the loop.
628f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */
629f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
630f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
631f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
632f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return spanNot(s, length);
633f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
634f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
635f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanLength==length) {
636f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return length;
637f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
638f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
639f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Consider strings; they may overlap with the span.
640f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    OffsetList offsets;
641f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanCondition==USET_SPAN_CONTAINED) {
642f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Use offset list to try all possibilities.
643f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        offsets.setMaxLength(maxLength16);
644f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
645f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t pos=spanLength, rest=length-pos;
646f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i, stringsLength=strings.size();
647f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    for(;;) {
648f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanCondition==USET_SPAN_CONTAINED) {
649f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            for(i=0; i<stringsLength; ++i) {
650f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t overlap=spanLengths[i];
651f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap==ALL_CP_CONTAINED) {
652f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // Irrelevant string.
653f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
654f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
655f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                const UChar *s16=string.getBuffer();
656f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t length16=string.length();
657f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
658f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match this string at pos-overlap..pos.
659f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>=LONG_SPAN) {
660f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=length16;
661f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // While contained: No point matching fully inside the code point span.
662f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
663f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
664f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>spanLength) {
665f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=spanLength;
666f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
667f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
668f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                for(;;) {
669f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(inc>rest) {
670f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
671f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
672f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Try to match if the increment is not listed already.
673f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
674f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        if(inc==rest) {
675f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                            return length;  // Reached the end of the string.
676f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        }
677f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        offsets.addOffset(inc);
678f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
679f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(overlap==0) {
680f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
681f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
682f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    --overlap;
683f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ++inc;
684f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
685f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
686f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else /* USET_SPAN_SIMPLE */ {
687f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            int32_t maxInc=0, maxOverlap=0;
688f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            for(i=0; i<stringsLength; ++i) {
689f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t overlap=spanLengths[i];
690f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // For longest match, we do need to try to match even an all-contained string
691f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // to find the match from the earliest start.
692f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
693f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
694f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                const UChar *s16=string.getBuffer();
695f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t length16=string.length();
696f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
697f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match this string at pos-overlap..pos.
698f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>=LONG_SPAN) {
699f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=length16;
700f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Longest match: Need to match fully inside the code point span
701f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // to find the match from the earliest start.
702f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
703f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>spanLength) {
704f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=spanLength;
705f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
706f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
707f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                for(;;) {
708f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(inc>rest || overlap<maxOverlap) {
709f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
710f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
711f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Try to match if the string is longer or starts earlier.
712f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
713f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        matches16CPB(s, pos-overlap, length, s16, length16)
714f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ) {
715f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        maxInc=inc;  // Longest match from earliest start.
716f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        maxOverlap=overlap;
717f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
718f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
719f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    --overlap;
720f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ++inc;
721f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
722f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
723f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
724f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(maxInc!=0 || maxOverlap!=0) {
725f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Longest-match algorithm, and there was a string match.
726f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Simply continue after it.
727f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                pos+=maxInc;
728f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                rest-=maxInc;
729f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(rest==0) {
730f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    return length;  // Reached the end of the string.
731f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
732f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=0;  // Match strings from after a string match.
733f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;
734f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
735f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
736f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Finished trying to match all strings at pos.
737f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
738f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanLength!=0 || pos==0) {
739f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The position is after an unlimited code point span (spanLength!=0),
740f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // not after a string match.
741f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The only position where spanLength==0 after a span is pos==0.
742f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // Otherwise, an unlimited code point span is only tried again when no
743f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // strings match, and if such a non-initial span fails we stop.
744f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(offsets.isEmpty()) {
745f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                return pos;  // No strings matched after a span.
746f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
747f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // Match strings from after the next string match.
748f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else {
749f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The position is after a string match (or a single code point).
750f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(offsets.isEmpty()) {
751f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // No more strings matched after a previous string match.
752f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try another code point span from after the last string match.
753f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
754f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if( spanLength==rest || // Reached the end of the string, or
755f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanLength==0       // neither strings nor span progressed.
756f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                ) {
757f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    return pos+spanLength;
758f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
759f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                pos+=spanLength;
760f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                rest-=spanLength;
761f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;  // spanLength>0: Match strings from after a span.
762f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            } else {
763f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match only one code point from after a string match if some
764f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // string matched beyond it, so that we try all possible positions
765f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // and don't overshoot.
766f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=spanOne(spanSet, s+pos, rest);
767f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(spanLength>0) {
768f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(spanLength==rest) {
769f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        return length;  // Reached the end of the string.
770f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
771f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Match strings after this code point.
772f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // There cannot be any increments below it because UnicodeSet strings
773f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // contain multiple code points.
774f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    pos+=spanLength;
775f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    rest-=spanLength;
776f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    offsets.shift(spanLength);
777f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanLength=0;
778f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // Match strings from after a single code point.
779f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
780f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Match strings from after the next string match.
781f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
782f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
783f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t minOffset=offsets.popMinimum();
784f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos+=minOffset;
785f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        rest-=minOffset;
786f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanLength=0;  // Match strings from after a string match.
787f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
788f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
789f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
790f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
791f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
792f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return spanNotBack(s, length);
793f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
794f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
795f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(pos==0) {
796f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return 0;
797f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
798f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t spanLength=length-pos;
799f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
800f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Consider strings; they may overlap with the span.
801f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    OffsetList offsets;
802f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanCondition==USET_SPAN_CONTAINED) {
803f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Use offset list to try all possibilities.
804f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        offsets.setMaxLength(maxLength16);
805f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
806f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i, stringsLength=strings.size();
807f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    uint8_t *spanBackLengths=spanLengths;
808f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(all) {
809f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanBackLengths+=stringsLength;
810f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
811f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    for(;;) {
812f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanCondition==USET_SPAN_CONTAINED) {
813f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            for(i=0; i<stringsLength; ++i) {
814f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t overlap=spanBackLengths[i];
815f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap==ALL_CP_CONTAINED) {
816f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // Irrelevant string.
817f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
818f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
819f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                const UChar *s16=string.getBuffer();
820f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t length16=string.length();
821f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
822f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match this string at pos-(length16-overlap)..pos-length16.
823f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>=LONG_SPAN) {
824f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=length16;
825f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // While contained: No point matching fully inside the code point span.
826f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    int32_t len1=0;
827f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    U16_FWD_1(s16, len1, overlap);
828f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap-=len1;  // Length of the string minus the first code point.
829f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
830f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>spanLength) {
831f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=spanLength;
832f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
833f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
834f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                for(;;) {
835f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(dec>pos) {
836f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
837f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
838f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Try to match if the decrement is not listed already.
839f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
840f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        if(dec==pos) {
841f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                            return 0;  // Reached the start of the string.
842f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        }
843f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        offsets.addOffset(dec);
844f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
845f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(overlap==0) {
846f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
847f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
848f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    --overlap;
849f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ++dec;
850f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
851f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
852f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else /* USET_SPAN_SIMPLE */ {
853f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            int32_t maxDec=0, maxOverlap=0;
854f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            for(i=0; i<stringsLength; ++i) {
855f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t overlap=spanBackLengths[i];
856f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // For longest match, we do need to try to match even an all-contained string
857f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // to find the match from the latest end.
858f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
859f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
860f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                const UChar *s16=string.getBuffer();
861f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t length16=string.length();
862f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
863f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match this string at pos-(length16-overlap)..pos-length16.
864f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>=LONG_SPAN) {
865f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=length16;
866f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Longest match: Need to match fully inside the code point span
867f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // to find the match from the latest end.
868f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
869f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>spanLength) {
870f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=spanLength;
871f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
872f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
873f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                for(;;) {
874f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(dec>pos || overlap<maxOverlap) {
875f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
876f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
877f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Try to match if the string is longer or ends later.
878f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
879f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        matches16CPB(s, pos-dec, length, s16, length16)
880f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ) {
881f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        maxDec=dec;  // Longest match from latest end.
882f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        maxOverlap=overlap;
883f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
884f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
885f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    --overlap;
886f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ++dec;
887f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
888f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
889f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
890f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(maxDec!=0 || maxOverlap!=0) {
891f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Longest-match algorithm, and there was a string match.
892f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Simply continue before it.
893f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                pos-=maxDec;
894f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(pos==0) {
895f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    return 0;  // Reached the start of the string.
896f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
897f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=0;  // Match strings from before a string match.
898f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;
899f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
900f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
901f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Finished trying to match all strings at pos.
902f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
903f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanLength!=0 || pos==length) {
904f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The position is before an unlimited code point span (spanLength!=0),
905f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // not before a string match.
906f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The only position where spanLength==0 before a span is pos==length.
907f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // Otherwise, an unlimited code point span is only tried again when no
908f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // strings match, and if such a non-initial span fails we stop.
909f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(offsets.isEmpty()) {
910f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                return pos;  // No strings matched before a span.
911f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
912f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // Match strings from before the next string match.
913f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else {
914f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The position is before a string match (or a single code point).
915f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(offsets.isEmpty()) {
916f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // No more strings matched before a previous string match.
917f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try another code point span from before the last string match.
918f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t oldPos=pos;
919f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
920f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=oldPos-pos;
921f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if( pos==0 ||           // Reached the start of the string, or
922f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanLength==0       // neither strings nor span progressed.
923f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                ) {
924f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    return pos;
925f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
926f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;  // spanLength>0: Match strings from before a span.
927f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            } else {
928f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match only one code point from before a string match if some
929f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // string matched beyond it, so that we try all possible positions
930f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // and don't overshoot.
931f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=spanOneBack(spanSet, s, pos);
932f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(spanLength>0) {
933f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(spanLength==pos) {
934f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        return 0;  // Reached the start of the string.
935f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
936f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Match strings before this code point.
937f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // There cannot be any decrements below it because UnicodeSet strings
938f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // contain multiple code points.
939f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    pos-=spanLength;
940f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    offsets.shift(spanLength);
941f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanLength=0;
942f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // Match strings from before a single code point.
943f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
944f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Match strings from before the next string match.
945f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
946f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
947f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos-=offsets.popMinimum();
948f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanLength=0;  // Match strings from before a string match.
949f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
950f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
951f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
952f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
953f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
954f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return spanNotUTF8(s, length);
955f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
956f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
957f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanLength==length) {
958f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return length;
959f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
960f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
961f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Consider strings; they may overlap with the span.
962f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    OffsetList offsets;
963f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanCondition==USET_SPAN_CONTAINED) {
964f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Use offset list to try all possibilities.
965f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        offsets.setMaxLength(maxLength8);
966f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
967f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t pos=spanLength, rest=length-pos;
968f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i, stringsLength=strings.size();
969f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    uint8_t *spanUTF8Lengths=spanLengths;
970f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(all) {
971f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanUTF8Lengths+=2*stringsLength;
972f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
973f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    for(;;) {
974f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        const uint8_t *s8=utf8;
975f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t length8;
976f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanCondition==USET_SPAN_CONTAINED) {
977f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            for(i=0; i<stringsLength; ++i) {
978f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                length8=utf8Lengths[i];
979f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(length8==0) {
980f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // String not representable in UTF-8.
981f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
982f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t overlap=spanUTF8Lengths[i];
983f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap==ALL_CP_CONTAINED) {
984f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    s8+=length8;
985f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // Irrelevant string.
986f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
987f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
988f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match this string at pos-overlap..pos.
989f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>=LONG_SPAN) {
990f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=length8;
991f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // While contained: No point matching fully inside the code point span.
992f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
993f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
994f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>spanLength) {
995f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=spanLength;
996f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
997f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
998f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                for(;;) {
999f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(inc>rest) {
1000f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
1001f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1002f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Try to match if the increment is not listed already.
1003f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Match at code point boundaries. (The UTF-8 strings were converted
1004f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // from UTF-16 and are guaranteed to be well-formed.)
1005f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1006f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        !offsets.containsOffset(inc) &&
1007f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        matches8(s+pos-overlap, s8, length8)
1008f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1009f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ) {
1010f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        if(inc==rest) {
1011f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                            return length;  // Reached the end of the string.
1012f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        }
1013f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        offsets.addOffset(inc);
1014f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1015f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(overlap==0) {
1016f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
1017f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1018f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    --overlap;
1019f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ++inc;
1020f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1021f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                s8+=length8;
1022f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1023f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else /* USET_SPAN_SIMPLE */ {
1024f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            int32_t maxInc=0, maxOverlap=0;
1025f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            for(i=0; i<stringsLength; ++i) {
1026f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                length8=utf8Lengths[i];
1027f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(length8==0) {
1028f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // String not representable in UTF-8.
1029f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1030f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t overlap=spanUTF8Lengths[i];
1031f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // For longest match, we do need to try to match even an all-contained string
1032f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // to find the match from the earliest start.
1033f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1034f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match this string at pos-overlap..pos.
1035f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>=LONG_SPAN) {
1036f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=length8;
1037f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Longest match: Need to match fully inside the code point span
1038f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // to find the match from the earliest start.
1039f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1040f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>spanLength) {
1041f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=spanLength;
1042f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1043f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1044f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                for(;;) {
1045f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(inc>rest || overlap<maxOverlap) {
1046f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
1047f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1048f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Try to match if the string is longer or starts earlier.
1049f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Match at code point boundaries. (The UTF-8 strings were converted
1050f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // from UTF-16 and are guaranteed to be well-formed.)
1051f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1052f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1053f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        matches8(s+pos-overlap, s8, length8)
1054f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1055f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ) {
1056f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        maxInc=inc;  // Longest match from earliest start.
1057f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        maxOverlap=overlap;
1058f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
1059f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1060f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    --overlap;
1061f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ++inc;
1062f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1063f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                s8+=length8;
1064f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1065f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1066f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(maxInc!=0 || maxOverlap!=0) {
1067f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Longest-match algorithm, and there was a string match.
1068f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Simply continue after it.
1069f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                pos+=maxInc;
1070f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                rest-=maxInc;
1071f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(rest==0) {
1072f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    return length;  // Reached the end of the string.
1073f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1074f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=0;  // Match strings from after a string match.
1075f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;
1076f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1077f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1078f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Finished trying to match all strings at pos.
1079f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1080f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanLength!=0 || pos==0) {
1081f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The position is after an unlimited code point span (spanLength!=0),
1082f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // not after a string match.
1083f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The only position where spanLength==0 after a span is pos==0.
1084f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // Otherwise, an unlimited code point span is only tried again when no
1085f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // strings match, and if such a non-initial span fails we stop.
1086f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(offsets.isEmpty()) {
1087f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                return pos;  // No strings matched after a span.
1088f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1089f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // Match strings from after the next string match.
1090f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else {
1091f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The position is after a string match (or a single code point).
1092f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(offsets.isEmpty()) {
1093f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // No more strings matched after a previous string match.
1094f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try another code point span from after the last string match.
1095f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1096f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if( spanLength==rest || // Reached the end of the string, or
1097f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanLength==0       // neither strings nor span progressed.
1098f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                ) {
1099f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    return pos+spanLength;
1100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                pos+=spanLength;
1102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                rest-=spanLength;
1103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;  // spanLength>0: Match strings from after a span.
1104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            } else {
1105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match only one code point from after a string match if some
1106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // string matched beyond it, so that we try all possible positions
1107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // and don't overshoot.
1108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=spanOneUTF8(spanSet, s+pos, rest);
1109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(spanLength>0) {
1110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(spanLength==rest) {
1111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        return length;  // Reached the end of the string.
1112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Match strings after this code point.
1114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // There cannot be any increments below it because UnicodeSet strings
1115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // contain multiple code points.
1116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    pos+=spanLength;
1117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    rest-=spanLength;
1118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    offsets.shift(spanLength);
1119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanLength=0;
1120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // Match strings from after a single code point.
1121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Match strings from after the next string match.
1123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t minOffset=offsets.popMinimum();
1126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos+=minOffset;
1127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        rest-=minOffset;
1128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanLength=0;  // Match strings from after a string match.
1129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
1130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
1131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return spanNotBackUTF8(s, length);
1135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
1136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(pos==0) {
1138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return 0;
1139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
1140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t spanLength=length-pos;
1141f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1142f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Consider strings; they may overlap with the span.
1143f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    OffsetList offsets;
1144f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(spanCondition==USET_SPAN_CONTAINED) {
1145f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Use offset list to try all possibilities.
1146f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        offsets.setMaxLength(maxLength8);
1147f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
1148f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i, stringsLength=strings.size();
1149f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    uint8_t *spanBackUTF8Lengths=spanLengths;
1150f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(all) {
1151f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanBackUTF8Lengths+=3*stringsLength;
1152f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
1153f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    for(;;) {
1154f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        const uint8_t *s8=utf8;
1155f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t length8;
1156f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanCondition==USET_SPAN_CONTAINED) {
1157f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            for(i=0; i<stringsLength; ++i) {
1158f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                length8=utf8Lengths[i];
1159f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(length8==0) {
1160f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // String not representable in UTF-8.
1161f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1162f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t overlap=spanBackUTF8Lengths[i];
1163f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap==ALL_CP_CONTAINED) {
1164f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    s8+=length8;
1165f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // Irrelevant string.
1166f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1167f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1168f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match this string at pos-(length8-overlap)..pos-length8.
1169f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>=LONG_SPAN) {
1170f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=length8;
1171f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // While contained: No point matching fully inside the code point span.
1172f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    int32_t len1=0;
1173f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    U8_FWD_1(s8, len1, overlap);
1174f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap-=len1;  // Length of the string minus the first code point.
1175f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1176f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>spanLength) {
1177f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=spanLength;
1178f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1179f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1180f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                for(;;) {
1181f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(dec>pos) {
1182f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
1183f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1184f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Try to match if the decrement is not listed already.
1185f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Match at code point boundaries. (The UTF-8 strings were converted
1186f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // from UTF-16 and are guaranteed to be well-formed.)
1187f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1188f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        !offsets.containsOffset(dec) &&
1189f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        matches8(s+pos-dec, s8, length8)
1190f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ) {
1191f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        if(dec==pos) {
1192f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                            return 0;  // Reached the start of the string.
1193f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        }
1194f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        offsets.addOffset(dec);
1195f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1196f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(overlap==0) {
1197f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
1198f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1199f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    --overlap;
1200f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ++dec;
1201f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1202f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                s8+=length8;
1203f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1204f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else /* USET_SPAN_SIMPLE */ {
1205f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            int32_t maxDec=0, maxOverlap=0;
1206f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            for(i=0; i<stringsLength; ++i) {
1207f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                length8=utf8Lengths[i];
1208f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(length8==0) {
1209f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // String not representable in UTF-8.
1210f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1211f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t overlap=spanBackUTF8Lengths[i];
1212f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // For longest match, we do need to try to match even an all-contained string
1213f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // to find the match from the latest end.
1214f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1215f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match this string at pos-(length8-overlap)..pos-length8.
1216f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>=LONG_SPAN) {
1217f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=length8;
1218f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Longest match: Need to match fully inside the code point span
1219f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // to find the match from the latest end.
1220f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1221f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(overlap>spanLength) {
1222f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    overlap=spanLength;
1223f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1224f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1225f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                for(;;) {
1226f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(dec>pos || overlap<maxOverlap) {
1227f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
1228f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1229f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Try to match if the string is longer or ends later.
1230f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Match at code point boundaries. (The UTF-8 strings were converted
1231f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // from UTF-16 and are guaranteed to be well-formed.)
1232f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1233f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1234f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        matches8(s+pos-dec, s8, length8)
1235f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ) {
1236f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        maxDec=dec;  // Longest match from latest end.
1237f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        maxOverlap=overlap;
1238f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        break;
1239f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1240f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    --overlap;
1241f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    ++dec;
1242f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1243f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                s8+=length8;
1244f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1245f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1246f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(maxDec!=0 || maxOverlap!=0) {
1247f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Longest-match algorithm, and there was a string match.
1248f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Simply continue before it.
1249f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                pos-=maxDec;
1250f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(pos==0) {
1251f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    return 0;  // Reached the start of the string.
1252f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1253f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=0;  // Match strings from before a string match.
1254f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;
1255f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1256f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1257f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Finished trying to match all strings at pos.
1258f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1259f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(spanLength!=0 || pos==length) {
1260f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The position is before an unlimited code point span (spanLength!=0),
1261f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // not before a string match.
1262f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The only position where spanLength==0 before a span is pos==length.
1263f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // Otherwise, an unlimited code point span is only tried again when no
1264f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // strings match, and if such a non-initial span fails we stop.
1265f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(offsets.isEmpty()) {
1266f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                return pos;  // No strings matched before a span.
1267f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1268f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // Match strings from before the next string match.
1269f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        } else {
1270f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // The position is before a string match (or a single code point).
1271f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(offsets.isEmpty()) {
1272f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // No more strings matched before a previous string match.
1273f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try another code point span from before the last string match.
1274f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                int32_t oldPos=pos;
1275f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1276f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=oldPos-pos;
1277f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if( pos==0 ||           // Reached the start of the string, or
1278f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanLength==0       // neither strings nor span progressed.
1279f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                ) {
1280f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    return pos;
1281f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1282f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;  // spanLength>0: Match strings from before a span.
1283f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            } else {
1284f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Try to match only one code point from before a string match if some
1285f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // string matched beyond it, so that we try all possible positions
1286f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // and don't overshoot.
1287f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                spanLength=spanOneBackUTF8(spanSet, s, pos);
1288f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                if(spanLength>0) {
1289f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    if(spanLength==pos) {
1290f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                        return 0;  // Reached the start of the string.
1291f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    }
1292f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // Match strings before this code point.
1293f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // There cannot be any decrements below it because UnicodeSet strings
1294f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    // contain multiple code points.
1295f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    pos-=spanLength;
1296f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    offsets.shift(spanLength);
1297f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    spanLength=0;
1298f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                    continue;  // Match strings from before a single code point.
1299f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                }
1300f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                // Match strings from before the next string match.
1301f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1302f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1303f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos-=offsets.popMinimum();
1304f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanLength=0;  // Match strings from before a string match.
1305f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
1306f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
1307f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1308f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/*
1309f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1310f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
1311f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Theoretical algorithm:
1312f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Iterate through the string, and at each code point boundary:
1313f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If the code point there is in the set, then return with the current position.
1314f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If a set string matches at the current position, then return with the current position.
1315f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
1316f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Optimized implementation:
1317f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
1318f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (Same assumption as for span() above.)
1319f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
1320f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Create and cache a spanNotSet which contains all of the single code points
1321f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * of the original set but none of its strings.
1322f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * For each set string add its initial code point to the spanNotSet.
1323f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (Also add its final code point for spanNotBack().)
1324f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
1325f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * - Loop:
1326f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1327f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If the current code point is in the original set, then
1328f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     return the current position.
1329f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If any set string matches at the current position, then
1330f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     return the current position.
1331f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   + If there is no match at the current position, neither for the code point there
1332f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     nor for any set string, then skip this code point and continue the loop.
1333f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     This happens for set-string-initial code points that were added to spanNotSet
1334f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *     when there is not actually a match for such a set string.
1335f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */
1336f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1337f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1338f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t pos=0, rest=length;
1339f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i, stringsLength=strings.size();
1340f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    do {
1341f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Span until we find a code point from the set,
1342f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // or a code point that starts or ends some string.
1343f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1344f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(i==rest) {
1345f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return length;  // Reached the end of the string.
1346f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1347f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos+=i;
1348f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        rest-=i;
1349f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1350f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Check whether the current code point is in the original set,
1351f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // without the string starts and ends.
1352f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t cpLength=spanOne(spanSet, s+pos, rest);
1353f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(cpLength>0) {
1354f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return pos;  // There is a set element at pos.
1355f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1356f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1357f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Try to match the strings at pos.
1358f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        for(i=0; i<stringsLength; ++i) {
1359f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(spanLengths[i]==ALL_CP_CONTAINED) {
1360f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;  // Irrelevant string.
1361f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1362f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1363f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            const UChar *s16=string.getBuffer();
1364f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            int32_t length16=string.length();
1365f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1366f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                return pos;  // There is a set element at pos.
1367f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1368f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1369f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1370f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // The span(while not contained) ended on a string start/end which is
1371f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // not in the original set. Skip this code point and continue.
1372f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // cpLength<0
1373f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos-=cpLength;
1374f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        rest+=cpLength;
1375f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } while(rest!=0);
1376f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return length;  // Reached the end of the string.
1377f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
1378f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1379f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1380f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t pos=length;
1381f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i, stringsLength=strings.size();
1382f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    do {
1383f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Span until we find a code point from the set,
1384f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // or a code point that starts or ends some string.
1385f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1386f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(pos==0) {
1387f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return 0;  // Reached the start of the string.
1388f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1389f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1390f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Check whether the current code point is in the original set,
1391f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // without the string starts and ends.
1392f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t cpLength=spanOneBack(spanSet, s, pos);
1393f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(cpLength>0) {
1394f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return pos;  // There is a set element at pos.
1395f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1396f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1397f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Try to match the strings at pos.
1398f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        for(i=0; i<stringsLength; ++i) {
1399f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // Use spanLengths rather than a spanBackLengths pointer because
1400f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // it is easier and we only need to know whether the string is irrelevant
1401f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // which is the same in either array.
1402f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(spanLengths[i]==ALL_CP_CONTAINED) {
1403f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                continue;  // Irrelevant string.
1404f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1405f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1406f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            const UChar *s16=string.getBuffer();
1407f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            int32_t length16=string.length();
1408f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1409f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                return pos;  // There is a set element at pos.
1410f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1411f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1412f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1413f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // The span(while not contained) ended on a string start/end which is
1414f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // not in the original set. Skip this code point and continue.
1415f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // cpLength<0
1416f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos+=cpLength;
1417f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } while(pos!=0);
1418f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return 0;  // Reached the start of the string.
1419f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
1420f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1421f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1422f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t pos=0, rest=length;
1423f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i, stringsLength=strings.size();
1424f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    uint8_t *spanUTF8Lengths=spanLengths;
1425f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(all) {
1426f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanUTF8Lengths+=2*stringsLength;
1427f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
1428f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    do {
1429f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Span until we find a code point from the set,
1430f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // or a code point that starts or ends some string.
1431f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1432f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(i==rest) {
1433f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return length;  // Reached the end of the string.
1434f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1435f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos+=i;
1436f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        rest-=i;
1437f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1438f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Check whether the current code point is in the original set,
1439f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // without the string starts and ends.
1440f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1441f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(cpLength>0) {
1442f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return pos;  // There is a set element at pos.
1443f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1444f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1445f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Try to match the strings at pos.
1446f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        const uint8_t *s8=utf8;
1447f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t length8;
1448f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        for(i=0; i<stringsLength; ++i) {
1449f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            length8=utf8Lengths[i];
1450f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // ALL_CP_CONTAINED: Irrelevant string.
1451f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1452f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                return pos;  // There is a set element at pos.
1453f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1454f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            s8+=length8;
1455f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1456f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1457f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // The span(while not contained) ended on a string start/end which is
1458f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // not in the original set. Skip this code point and continue.
1459f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // cpLength<0
1460f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos-=cpLength;
1461f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        rest+=cpLength;
1462f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } while(rest!=0);
1463f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return length;  // Reached the end of the string.
1464f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
1465f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1466f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1467f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t pos=length;
1468f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    int32_t i, stringsLength=strings.size();
1469f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    uint8_t *spanBackUTF8Lengths=spanLengths;
1470f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if(all) {
1471f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        spanBackUTF8Lengths+=3*stringsLength;
1472f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
1473f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    do {
1474f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Span until we find a code point from the set,
1475f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // or a code point that starts or ends some string.
1476f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1477f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(pos==0) {
1478f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return 0;  // Reached the start of the string.
1479f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1480f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1481f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Check whether the current code point is in the original set,
1482f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // without the string starts and ends.
1483f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1484f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        if(cpLength>0) {
1485f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            return pos;  // There is a set element at pos.
1486f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1487f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1488f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // Try to match the strings at pos.
1489f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        const uint8_t *s8=utf8;
1490f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        int32_t length8;
1491f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        for(i=0; i<stringsLength; ++i) {
1492f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            length8=utf8Lengths[i];
1493f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            // ALL_CP_CONTAINED: Irrelevant string.
1494f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1495f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                return pos;  // There is a set element at pos.
1496f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            }
1497f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)            s8+=length8;
1498f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        }
1499f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1500f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // The span(while not contained) ended on a string start/end which is
1501f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // not in the original set. Skip this code point and continue.
1502f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        // cpLength<0
1503f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pos+=cpLength;
1504f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    } while(pos!=0);
1505f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return 0;  // Reached the start of the string.
1506f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
1507f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
1508f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_NAMESPACE_END
1509