1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius*   Copyright (C) 2007-2012, International Business Machines
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   file name:  bmpset.cpp
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   encoding:   US-ASCII
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   tab size:   8 (not used)
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   indentation:4
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created on: 2007jan29
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created by: Markus W. Scherer
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h"
19103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf8.h"
20103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf16.h"
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "bmpset.h"
23103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "uassert.h"
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        list(parentList), listLength(parentListLength) {
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memset(table7FF, 0, sizeof(table7FF));
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /*
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Set the list indexes for binary searches for
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * looked up in the bit tables.
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * The last pair of indexes is for finding supplementary code points.
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i;
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(i=1; i<=0x10; ++i) {
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    list4kStarts[0x11]=listLength-1;
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    initBits();
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    overrideIllegal();
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        list(newParentList), listLength(newParentListLength) {
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::~BMPSet() {
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
/*
 * Set bits in a bit rectangle in "vertical" bit organization.
 *
 * The value v in [start, limit) is represented by bit (v>>6) of table[v&0x3f]:
 * the low 6 bits select the table row, the upper 5 bits select the bit column.
 * All bits for start..limit-1 are turned on; no bit is ever cleared.
 *
 * Preconditions: start<limit<=0x800.
 *
 * All shifts are performed on uint32_t so that column 31 (values 0x7c0..0x7ff)
 * is handled without signed integer overflow.
 */
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
    U_ASSERT(start<limit);
    U_ASSERT(limit<=0x800);

    int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
    int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.

    // One bit selecting the column that start falls into.
    uint32_t bits=(uint32_t)1<<lead;
    if((start+1)==limit) {  // Single-character shortcut.
        table[trail]|=bits;
        return;
    }

    int32_t limitLead=limit>>6;
    int32_t limitTrail=limit&0x3f;

    if(lead==limitLead) {
        // Partial vertical bit column.
        while(trail<limitTrail) {
            table[trail++]|=bits;
        }
    } else {
        // Partial vertical bit column,
        // followed by a bit rectangle,
        // followed by another partial vertical bit column.
        if(trail>0) {
            // Finish the column that start begins in the middle of.
            do {
                table[trail++]|=bits;
            } while(trail<64);
            ++lead;
        }
        if(lead<limitLead) {
            // Full columns lead..limitLead-1: a mask with exactly those bits.
            // lead may be 31 here, so the shift must be unsigned.
            bits=~(((uint32_t)1<<lead)-1);
            if(limitLead<0x20) {
                // limitLead may be 31 as well; keep the arithmetic unsigned.
                bits&=((uint32_t)1<<limitLead)-1;
            }
            for(trail=0; trail<64; ++trail) {
                table[trail]|=bits;
            }
        }
        // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
        // Shifting by 32 would be undefined, so the shift count is clamped
        // to 31 in that case; the resulting bits value is never stored
        // because the loop condition trail<limitTrail is false right away.
        bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
        for(trail=0; trail<limitTrail; ++trail) {
            table[trail]|=bits;
        }
    }
}
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid BMPSet::initBits() {
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 start, limit;
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t listIndex=0;
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Set asciiBytes[].
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        start=list[listIndex++];
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(listIndex<listLength) {
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            limit=list[listIndex++];
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            limit=0x110000;
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(start>=0x80) {
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        do {
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            asciiBytes[start++]=1;
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } while(start<limit && start<0x80);
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(limit<=0x80);
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Set table7FF[].
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while(start<0x800) {
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(limit>0x800) {
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            start=0x800;
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        start=list[listIndex++];
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(listIndex<listLength) {
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            limit=list[listIndex++];
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            limit=0x110000;
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Set bmpBlockBits[].
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t minStart=0x800;
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while(start<0x10000) {
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(limit>0x10000) {
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            limit=0x10000;
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(start<minStart) {
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            start=minStart;
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(start&0x3f) {
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Mixed-value block of 64 code points.
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                start>>=6;
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                start=(start+1)<<6;  // Round up to the next block boundary.
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                minStart=start;      // Ignore further ranges in this block.
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(start<limit) {
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(start<(limit&~0x3f)) {
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Multiple all-ones blocks of 64 code points each.
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    set32x64Bits(bmpBlockBits, start>>6, limit>>6);
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(limit&0x3f) {
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Mixed-value block of 64 code points.
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    limit>>=6;
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    limit=(limit+1)<<6;  // Round up to the next block boundary.
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    minStart=limit;      // Ignore further ranges in this block.
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(limit==0x10000) {
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        start=list[listIndex++];
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(listIndex<listLength) {
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            limit=list[listIndex++];
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            limit=0x110000;
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Override some bits and bytes to the result of contains(FFFD)
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for faster validity checking at runtime.
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * No need to set 0 values where they were reset to 0 in the constructor
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and not modified by initBits().
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Need to set 0 values for surrogates D800..DFFF.
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid BMPSet::overrideIllegal() {
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t bits, mask;
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i;
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // contains(FFFD)==TRUE
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0x80; i<0xc0; ++i) {
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            asciiBytes[i]=1;
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        bits=3;                 // Lead bytes 0xC0 and 0xC1.
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<64; ++i) {
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            table7FF[i]|=bits;
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        bits=1;                 // Lead byte 0xE0.
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<32; ++i) {   // First half of 4k block.
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bmpBlockBits[i]|=bits;
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        mask=~(0x10001<<0xd);   // Lead byte 0xED.
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        bits=1<<0xd;
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=32; i<64; ++i) {  // Second half of 4k block.
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // contains(FFFD)==FALSE
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        mask=~(0x10001<<0xd);   // Lead byte 0xED.
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=32; i<64; ++i) {  // Second half of 4k block.
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bmpBlockBits[i]&=mask;
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* Examples:
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                       findCodePoint(c)
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru       set              list[]         c=0 1 3 4 7 8
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru       ===              ==============   ===========
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru       []               [110000]         0 0 0 0 0 0
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru       [:Any:]          [0, 110000]      1 1 1 1 1 1
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Return the smallest i such that c < list[i].  Assume
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (c < list[lo])
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return lo;
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // High runner test.  c is often after the last range, so an
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // initial check for this condition pays off.
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (lo >= hi || c >= list[hi-1])
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return hi;
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // invariant: c >= list[lo]
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // invariant: c < list[hi]
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for (;;) {
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t i = (lo + hi) >> 1;
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (i == lo) {
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break; // Found!
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else if (c < list[i]) {
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            hi = i;
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            lo = i;
271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return hi;
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::contains(UChar32 c) const {
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if((uint32_t)c<=0x7f) {
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return (UBool)asciiBytes[c];
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else if((uint32_t)c<=0x7ff) {
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int lead=c>>12;
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(twoBits<=1) {
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // All 64 code points with the same bits 15..6
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // are either in the set or not.
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return (UBool)twoBits;
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Look up the code point in its 4k block of code points.
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else if((uint32_t)c<=0x10ffff) {
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // surrogate or supplementary code point
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Out-of-range code points get FALSE, consistent with long-standing
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // behavior of UnicodeSet::contains(c).
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return FALSE;
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Check for sufficient length for trail unit for each surrogate pair.
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Handle single surrogates as surrogate code points as usual in ICU.
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar *
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar c, c2;
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition) {
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // span
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        do {
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c=*s;
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(c<=0x7f) {
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(!asciiBytes[c]) {
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<=0x7ff) {
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<0xd800 || c>=0xe000) {
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int lead=c>>12;
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(twoBits<=1) {
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // All 64 code points with the same bits 15..6
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // are either in the set or not.
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(twoBits==0) {
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Look up the code point in its 4k block of code points.
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // surrogate code point
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // surrogate pair
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ++s;
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } while(++s<limit);
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // span not
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        do {
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c=*s;
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(c<=0x7f) {
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(asciiBytes[c]) {
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<=0x7ff) {
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<0xd800 || c>=0xe000) {
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int lead=c>>12;
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(twoBits<=1) {
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // All 64 code points with the same bits 15..6
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // are either in the set or not.
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(twoBits!=0) {
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Look up the code point in its 4k block of code points.
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // surrogate code point
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // surrogate pair
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ++s;
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } while(++s<limit);
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return s;
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* Symmetrical with span(). */
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar *
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar c, c2;
399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition) {
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // span
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(;;) {
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c=*(--limit);
404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(c<=0x7f) {
405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(!asciiBytes[c]) {
406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<=0x7ff) {
409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<0xd800 || c>=0xe000) {
413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int lead=c>>12;
414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(twoBits<=1) {
416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // All 64 code points with the same bits 15..6
417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // are either in the set or not.
418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(twoBits==0) {
419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {
422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Look up the code point in its 4k block of code points.
423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // surrogate code point
429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // surrogate pair
434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                --limit;
438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(s==limit) {
440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return s;
441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // span not
445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(;;) {
446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c=*(--limit);
447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(c<=0x7f) {
448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(asciiBytes[c]) {
449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<=0x7ff) {
452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<0xd800 || c>=0xe000) {
456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int lead=c>>12;
457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(twoBits<=1) {
459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // All 64 code points with the same bits 15..6
460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // are either in the set or not.
461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(twoBits!=0) {
462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {
465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // Look up the code point in its 4k block of code points.
466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // surrogate code point
472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // surrogate pair
477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                --limit;
481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(s==limit) {
483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return s;
484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return limit+1;
488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Precheck for sufficient trail bytes at end of string only once per span.
492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Check validity.
493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst uint8_t *
495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const uint8_t *limit=s+length;
497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t b=*s;
498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if((int8_t)b>=0) {
499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Initial all-ASCII span.
500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(spanCondition) {
501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            do {
502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(!asciiBytes[b] || ++s==limit) {
503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return s;
504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                b=*s;
506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } while((int8_t)b>=0);
507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            do {
509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(asciiBytes[b] || ++s==limit) {
510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return s;
511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                b=*s;
513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } while((int8_t)b>=0);
514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        length=(int32_t)(limit-s);
516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const uint8_t *limit0=limit;
523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /*
525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Make sure that the last 1/2/3/4-byte sequence before limit is complete
526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * or runs into a lead byte.
527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * In the span loop compare s with limit only once
528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * per multi-byte character.
529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     *
530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Give a trailing illegal sequence the same value as the result of contains(FFFD),
531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * including it if that is part of the span, otherwise set limit0 to before
532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * the truncated sequence.
533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    b=*(limit-1);
535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if((int8_t)b<0) {
536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // b>=0x80: lead or trail byte
537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(b<0xc0) {
538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // single trail byte, check for preceding 3- or 4-byte lead byte
539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(length>=2 && (b=*(limit-2))>=0xe0) {
540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                limit-=2;
541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(asciiBytes[0x80]!=spanCondition) {
542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    limit0=limit;
543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // 4-byte lead byte with only two trail bytes
546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                limit-=3;
547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(asciiBytes[0x80]!=spanCondition) {
548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    limit0=limit;
549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // lead byte with no trail bytes
553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            --limit;
554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(asciiBytes[0x80]!=spanCondition) {
555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                limit0=limit;
556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t t1, t2, t3;
561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while(s<limit) {
563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        b=*s;
564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(b<0xc0) {
565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // ASCII; or trail bytes with the result of contains(FFFD).
566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(spanCondition) {
567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                do {
568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(!asciiBytes[b]) {
569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return s;
570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } else if(++s==limit) {
571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return limit0;
572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    b=*s;
574ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } while(b<0xc0);
575ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
576ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                do {
577ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(asciiBytes[b]) {
578ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return s;
579ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } else if(++s==limit) {
580ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return limit0;
581ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
582ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    b=*s;
583ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } while(b<0xc0);
584ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
585ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
586ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ++s;  // Advance past the lead byte.
587ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(b>=0xe0) {
588ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(b<0xf0) {
589ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( /* handle U+0000..U+FFFF inline */
590ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
591ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
592ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
593ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    b&=0xf;
594ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
595ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(twoBits<=1) {
596ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // All 64 code points with this lead byte and middle trail byte
597ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // are either in the set or not.
598ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(twoBits!=(uint32_t)spanCondition) {
599ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return s-1;
600ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
601ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } else {
602ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Look up the code point in its 4k block of code points.
603ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        UChar32 c=(b<<12)|(t1<<6)|t2;
604ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
605ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            return s-1;
606ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
607ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
608ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    s+=2;
609ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    continue;
610ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
611ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if( /* handle U+10000..U+10FFFF inline */
612ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
613ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
614ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
615ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            ) {
616ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Give an illegal sequence the same value as the result of contains(FFFD).
617ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
618ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if( (   (0x10000<=c && c<=0x10ffff) ?
619ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
620ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            asciiBytes[0x80]
621ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ) != spanCondition
622ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ) {
623ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return s-1;
624ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
625ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                s+=3;
626ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
627ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
628ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else /* 0xc0<=b<0xe0 */ {
629ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if( /* handle U+0000..U+07FF inline */
630ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                (t1=(uint8_t)(*s-0x80)) <= 0x3f
631ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            ) {
63285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
633ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return s-1;
634ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
635ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ++s;
636ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                continue;
637ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
638ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Give an illegal sequence the same value as the result of contains(FFFD).
641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Handle each byte of an illegal sequence separately to simplify the code;
642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // no need to optimize error handling.
643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(asciiBytes[0x80]!=spanCondition) {
644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return s-1;
645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return limit0;
649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * While going backwards through UTF-8 optimize only for ASCII.
653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * possible to tell from the last byte in a multi-byte sequence how many
655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * preceding bytes there should be. Therefore, going backwards through UTF-8
656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * is much harder than going forward.
657ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
658ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t
659ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
660ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
661ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
662ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
663ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
664ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint8_t b;
665ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
666ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
667ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        b=s[--length];
668ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if((int8_t)b>=0) {
669ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // ASCII sub-span
670ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(spanCondition) {
671ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                do {
672ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(!asciiBytes[b]) {
673ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return length+1;
674ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } else if(length==0) {
675ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return 0;
676ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
677ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    b=s[--length];
678ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } while((int8_t)b>=0);
679ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
680ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                do {
681ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if(asciiBytes[b]) {
682ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return length+1;
683ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } else if(length==0) {
684ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return 0;
685ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
686ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    b=s[--length];
687ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } while((int8_t)b>=0);
688ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
689ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
690ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
691ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t prev=length;
692ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UChar32 c;
6938393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius        // trail byte: collect a multi-byte character
6948393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius        // (or  lead byte in last-trail position)
6958393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius        c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
696ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // c is a valid code point, not ASCII, not a surrogate
697ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(c<=0x7ff) {
69885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
699ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return prev+1;
700ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
701ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else if(c<=0xffff) {
702ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int lead=c>>12;
703ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
704ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(twoBits<=1) {
705ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // All 64 code points with the same bits 15..6
706ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // are either in the set or not.
707ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(twoBits!=(uint32_t)spanCondition) {
708ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return prev+1;
709ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
710ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
711ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Look up the code point in its 4k block of code points.
712ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
713ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return prev+1;
714ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
715ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
716ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
717ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
718ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return prev+1;
719ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
720ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
721ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } while(length>0);
722ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return 0;
723ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
724ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
725ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
726