1/*
2******************************************************************************
3*
4*   Copyright (C) 2007-2008, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  bmpset.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2007jan29
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/uniset.h"
19#include "cmemory.h"
20#include "bmpset.h"
21
22U_NAMESPACE_BEGIN
23
24BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
25        list(parentList), listLength(parentListLength) {
26    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
27    uprv_memset(table7FF, 0, sizeof(table7FF));
28    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
29
30    /*
31     * Set the list indexes for binary searches for
32     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
33     * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
34     * looked up in the bit tables.
35     * The last pair of indexes is for finding supplementary code points.
36     */
37    list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
38    int32_t i;
39    for(i=1; i<=0x10; ++i) {
40        list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
41    }
42    list4kStarts[0x11]=listLength-1;
43
44    initBits();
45    overrideIllegal();
46}
47
48BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
49        list(newParentList), listLength(newParentListLength) {
50    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
51    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
52    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
53    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
54}
55
56BMPSet::~BMPSet() {
57}
58
59/*
60 * Set bits in a bit rectangle in "vertical" bit organization.
61 * start<limit<=0x800
62 */
63static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
64    int32_t lead=start>>6;
65    int32_t trail=start&0x3f;
66
67    // Set one bit indicating an all-one block.
68    uint32_t bits=(uint32_t)1<<lead;
69    if((start+1)==limit) {  // Single-character shortcut.
70        table[trail]|=bits;
71        return;
72    }
73
74    int32_t limitLead=limit>>6;
75    int32_t limitTrail=limit&0x3f;
76
77    if(lead==limitLead) {
78        // Partial vertical bit column.
79        while(trail<limitTrail) {
80            table[trail++]|=bits;
81        }
82    } else {
83        // Partial vertical bit column,
84        // followed by a bit rectangle,
85        // followed by another partial vertical bit column.
86        if(trail>0) {
87            do {
88                table[trail++]|=bits;
89            } while(trail<64);
90            ++lead;
91        }
92        if(lead<limitLead) {
93            bits=~((1<<lead)-1);
94            if(limitLead<0x20) {
95                bits&=(1<<limitLead)-1;
96            }
97            for(trail=0; trail<64; ++trail) {
98                table[trail]|=bits;
99            }
100        }
101        bits=1<<limitLead;
102        for(trail=0; trail<limitTrail; ++trail) {
103            table[trail]|=bits;
104        }
105    }
106}
107
108void BMPSet::initBits() {
109    UChar32 start, limit;
110    int32_t listIndex=0;
111
112    // Set asciiBytes[].
113    do {
114        start=list[listIndex++];
115        if(listIndex<listLength) {
116            limit=list[listIndex++];
117        } else {
118            limit=0x110000;
119        }
120        if(start>=0x80) {
121            break;
122        }
123        do {
124            asciiBytes[start++]=1;
125        } while(start<limit && start<0x80);
126    } while(limit<=0x80);
127
128    // Set table7FF[].
129    while(start<0x800) {
130        set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
131        if(limit>0x800) {
132            start=0x800;
133            break;
134        }
135
136        start=list[listIndex++];
137        if(listIndex<listLength) {
138            limit=list[listIndex++];
139        } else {
140            limit=0x110000;
141        }
142    }
143
144    // Set bmpBlockBits[].
145    int32_t minStart=0x800;
146    while(start<0x10000) {
147        if(limit>0x10000) {
148            limit=0x10000;
149        }
150
151        if(start<minStart) {
152            start=minStart;
153        }
154        if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
155            if(start&0x3f) {
156                // Mixed-value block of 64 code points.
157                start>>=6;
158                bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
159                start=(start+1)<<6;  // Round up to the next block boundary.
160                minStart=start;      // Ignore further ranges in this block.
161            }
162            if(start<limit) {
163                if(start<(limit&~0x3f)) {
164                    // Multiple all-ones blocks of 64 code points each.
165                    set32x64Bits(bmpBlockBits, start>>6, limit>>6);
166                }
167
168                if(limit&0x3f) {
169                    // Mixed-value block of 64 code points.
170                    limit>>=6;
171                    bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
172                    limit=(limit+1)<<6;  // Round up to the next block boundary.
173                    minStart=limit;      // Ignore further ranges in this block.
174                }
175            }
176        }
177
178        if(limit==0x10000) {
179            break;
180        }
181
182        start=list[listIndex++];
183        if(listIndex<listLength) {
184            limit=list[listIndex++];
185        } else {
186            limit=0x110000;
187        }
188    }
189}
190
191/*
192 * Override some bits and bytes to the result of contains(FFFD)
193 * for faster validity checking at runtime.
194 * No need to set 0 values where they were reset to 0 in the constructor
195 * and not modified by initBits().
196 * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
197 * Need to set 0 values for surrogates D800..DFFF.
198 */
199void BMPSet::overrideIllegal() {
200    uint32_t bits, mask;
201    int32_t i;
202
203    if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
204        // contains(FFFD)==TRUE
205        for(i=0x80; i<0xc0; ++i) {
206            asciiBytes[i]=1;
207        }
208
209        bits=3;                 // Lead bytes 0xC0 and 0xC1.
210        for(i=0; i<64; ++i) {
211            table7FF[i]|=bits;
212        }
213
214        bits=1;                 // Lead byte 0xE0.
215        for(i=0; i<32; ++i) {   // First half of 4k block.
216            bmpBlockBits[i]|=bits;
217        }
218
219        mask=~(0x10001<<0xd);   // Lead byte 0xED.
220        bits=1<<0xd;
221        for(i=32; i<64; ++i) {  // Second half of 4k block.
222            bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
223        }
224    } else {
225        // contains(FFFD)==FALSE
226        mask=~(0x10001<<0xd);   // Lead byte 0xED.
227        for(i=32; i<64; ++i) {  // Second half of 4k block.
228            bmpBlockBits[i]&=mask;
229        }
230    }
231}
232
233int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
234    /* Examples:
235                                       findCodePoint(c)
236       set              list[]         c=0 1 3 4 7 8
237       ===              ==============   ===========
238       []               [110000]         0 0 0 0 0 0
239       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
240       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
241       [:Any:]          [0, 110000]      1 1 1 1 1 1
242     */
243
244    // Return the smallest i such that c < list[i].  Assume
245    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
246    if (c < list[lo])
247        return lo;
248    // High runner test.  c is often after the last range, so an
249    // initial check for this condition pays off.
250    if (lo >= hi || c >= list[hi-1])
251        return hi;
252    // invariant: c >= list[lo]
253    // invariant: c < list[hi]
254    for (;;) {
255        int32_t i = (lo + hi) >> 1;
256        if (i == lo) {
257            break; // Found!
258        } else if (c < list[i]) {
259            hi = i;
260        } else {
261            lo = i;
262        }
263    }
264    return hi;
265}
266
267UBool
268BMPSet::contains(UChar32 c) const {
269    if((uint32_t)c<=0x7f) {
270        return (UBool)asciiBytes[c];
271    } else if((uint32_t)c<=0x7ff) {
272        return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
273    } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
274        int lead=c>>12;
275        uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
276        if(twoBits<=1) {
277            // All 64 code points with the same bits 15..6
278            // are either in the set or not.
279            return (UBool)twoBits;
280        } else {
281            // Look up the code point in its 4k block of code points.
282            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
283        }
284    } else if((uint32_t)c<=0x10ffff) {
285        // surrogate or supplementary code point
286        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
287    } else {
288        // Out-of-range code points get FALSE, consistent with long-standing
289        // behavior of UnicodeSet::contains(c).
290        return FALSE;
291    }
292}
293
294/*
295 * Check for sufficient length for trail unit for each surrogate pair.
296 * Handle single surrogates as surrogate code points as usual in ICU.
297 */
298const UChar *
299BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
300    UChar c, c2;
301
302    if(spanCondition) {
303        // span
304        do {
305            c=*s;
306            if(c<=0x7f) {
307                if(!asciiBytes[c]) {
308                    break;
309                }
310            } else if(c<=0x7ff) {
311                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
312                    break;
313                }
314            } else if(c<0xd800 || c>=0xe000) {
315                int lead=c>>12;
316                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
317                if(twoBits<=1) {
318                    // All 64 code points with the same bits 15..6
319                    // are either in the set or not.
320                    if(twoBits==0) {
321                        break;
322                    }
323                } else {
324                    // Look up the code point in its 4k block of code points.
325                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
326                        break;
327                    }
328                }
329            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
330                // surrogate code point
331                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
332                    break;
333                }
334            } else {
335                // surrogate pair
336                if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
337                    break;
338                }
339                ++s;
340            }
341        } while(++s<limit);
342    } else {
343        // span not
344        do {
345            c=*s;
346            if(c<=0x7f) {
347                if(asciiBytes[c]) {
348                    break;
349                }
350            } else if(c<=0x7ff) {
351                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
352                    break;
353                }
354            } else if(c<0xd800 || c>=0xe000) {
355                int lead=c>>12;
356                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
357                if(twoBits<=1) {
358                    // All 64 code points with the same bits 15..6
359                    // are either in the set or not.
360                    if(twoBits!=0) {
361                        break;
362                    }
363                } else {
364                    // Look up the code point in its 4k block of code points.
365                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
366                        break;
367                    }
368                }
369            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
370                // surrogate code point
371                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
372                    break;
373                }
374            } else {
375                // surrogate pair
376                if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
377                    break;
378                }
379                ++s;
380            }
381        } while(++s<limit);
382    }
383    return s;
384}
385
386/* Symmetrical with span(). */
387const UChar *
388BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
389    UChar c, c2;
390
391    if(spanCondition) {
392        // span
393        for(;;) {
394            c=*(--limit);
395            if(c<=0x7f) {
396                if(!asciiBytes[c]) {
397                    break;
398                }
399            } else if(c<=0x7ff) {
400                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
401                    break;
402                }
403            } else if(c<0xd800 || c>=0xe000) {
404                int lead=c>>12;
405                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
406                if(twoBits<=1) {
407                    // All 64 code points with the same bits 15..6
408                    // are either in the set or not.
409                    if(twoBits==0) {
410                        break;
411                    }
412                } else {
413                    // Look up the code point in its 4k block of code points.
414                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
415                        break;
416                    }
417                }
418            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
419                // surrogate code point
420                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
421                    break;
422                }
423            } else {
424                // surrogate pair
425                if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
426                    break;
427                }
428                --limit;
429            }
430            if(s==limit) {
431                return s;
432            }
433        }
434    } else {
435        // span not
436        for(;;) {
437            c=*(--limit);
438            if(c<=0x7f) {
439                if(asciiBytes[c]) {
440                    break;
441                }
442            } else if(c<=0x7ff) {
443                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
444                    break;
445                }
446            } else if(c<0xd800 || c>=0xe000) {
447                int lead=c>>12;
448                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
449                if(twoBits<=1) {
450                    // All 64 code points with the same bits 15..6
451                    // are either in the set or not.
452                    if(twoBits!=0) {
453                        break;
454                    }
455                } else {
456                    // Look up the code point in its 4k block of code points.
457                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
458                        break;
459                    }
460                }
461            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
462                // surrogate code point
463                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
464                    break;
465                }
466            } else {
467                // surrogate pair
468                if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
469                    break;
470                }
471                --limit;
472            }
473            if(s==limit) {
474                return s;
475            }
476        }
477    }
478    return limit+1;
479}
480
481/*
482 * Precheck for sufficient trail bytes at end of string only once per span.
483 * Check validity.
484 */
485const uint8_t *
486BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
487    const uint8_t *limit=s+length;
488    uint8_t b=*s;
489    if((int8_t)b>=0) {
490        // Initial all-ASCII span.
491        if(spanCondition) {
492            do {
493                if(!asciiBytes[b] || ++s==limit) {
494                    return s;
495                }
496                b=*s;
497            } while((int8_t)b>=0);
498        } else {
499            do {
500                if(asciiBytes[b] || ++s==limit) {
501                    return s;
502                }
503                b=*s;
504            } while((int8_t)b>=0);
505        }
506        length=(int32_t)(limit-s);
507    }
508
509    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
510        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
511    }
512
513    const uint8_t *limit0=limit;
514
515    /*
516     * Make sure that the last 1/2/3/4-byte sequence before limit is complete
517     * or runs into a lead byte.
518     * In the span loop compare s with limit only once
519     * per multi-byte character.
520     *
521     * Give a trailing illegal sequence the same value as the result of contains(FFFD),
522     * including it if that is part of the span, otherwise set limit0 to before
523     * the truncated sequence.
524     */
525    b=*(limit-1);
526    if((int8_t)b<0) {
527        // b>=0x80: lead or trail byte
528        if(b<0xc0) {
529            // single trail byte, check for preceding 3- or 4-byte lead byte
530            if(length>=2 && (b=*(limit-2))>=0xe0) {
531                limit-=2;
532                if(asciiBytes[0x80]!=spanCondition) {
533                    limit0=limit;
534                }
535            } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
536                // 4-byte lead byte with only two trail bytes
537                limit-=3;
538                if(asciiBytes[0x80]!=spanCondition) {
539                    limit0=limit;
540                }
541            }
542        } else {
543            // lead byte with no trail bytes
544            --limit;
545            if(asciiBytes[0x80]!=spanCondition) {
546                limit0=limit;
547            }
548        }
549    }
550
551    uint8_t t1, t2, t3;
552
553    while(s<limit) {
554        b=*s;
555        if(b<0xc0) {
556            // ASCII; or trail bytes with the result of contains(FFFD).
557            if(spanCondition) {
558                do {
559                    if(!asciiBytes[b]) {
560                        return s;
561                    } else if(++s==limit) {
562                        return limit0;
563                    }
564                    b=*s;
565                } while(b<0xc0);
566            } else {
567                do {
568                    if(asciiBytes[b]) {
569                        return s;
570                    } else if(++s==limit) {
571                        return limit0;
572                    }
573                    b=*s;
574                } while(b<0xc0);
575            }
576        }
577        ++s;  // Advance past the lead byte.
578        if(b>=0xe0) {
579            if(b<0xf0) {
580                if( /* handle U+0000..U+FFFF inline */
581                    (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
582                    (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
583                ) {
584                    b&=0xf;
585                    uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
586                    if(twoBits<=1) {
587                        // All 64 code points with this lead byte and middle trail byte
588                        // are either in the set or not.
589                        if(twoBits!=(uint32_t)spanCondition) {
590                            return s-1;
591                        }
592                    } else {
593                        // Look up the code point in its 4k block of code points.
594                        UChar32 c=(b<<12)|(t1<<6)|t2;
595                        if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
596                            return s-1;
597                        }
598                    }
599                    s+=2;
600                    continue;
601                }
602            } else if( /* handle U+10000..U+10FFFF inline */
603                (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
604                (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
605                (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
606            ) {
607                // Give an illegal sequence the same value as the result of contains(FFFD).
608                UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
609                if( (   (0x10000<=c && c<=0x10ffff) ?
610                            containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
611                            asciiBytes[0x80]
612                    ) != spanCondition
613                ) {
614                    return s-1;
615                }
616                s+=3;
617                continue;
618            }
619        } else /* 0xc0<=b<0xe0 */ {
620            if( /* handle U+0000..U+07FF inline */
621                (t1=(uint8_t)(*s-0x80)) <= 0x3f
622            ) {
623                if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
624                    return s-1;
625                }
626                ++s;
627                continue;
628            }
629        }
630
631        // Give an illegal sequence the same value as the result of contains(FFFD).
632        // Handle each byte of an illegal sequence separately to simplify the code;
633        // no need to optimize error handling.
634        if(asciiBytes[0x80]!=spanCondition) {
635            return s-1;
636        }
637    }
638
639    return limit0;
640}
641
642/*
643 * While going backwards through UTF-8 optimize only for ASCII.
644 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
645 * possible to tell from the last byte in a multi-byte sequence how many
646 * preceding bytes there should be. Therefore, going backwards through UTF-8
647 * is much harder than going forward.
648 */
649int32_t
650BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
651    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
652        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
653    }
654
655    uint8_t b;
656
657    do {
658        b=s[--length];
659        if((int8_t)b>=0) {
660            // ASCII sub-span
661            if(spanCondition) {
662                do {
663                    if(!asciiBytes[b]) {
664                        return length+1;
665                    } else if(length==0) {
666                        return 0;
667                    }
668                    b=s[--length];
669                } while((int8_t)b>=0);
670            } else {
671                do {
672                    if(asciiBytes[b]) {
673                        return length+1;
674                    } else if(length==0) {
675                        return 0;
676                    }
677                    b=s[--length];
678                } while((int8_t)b>=0);
679            }
680        }
681
682        int32_t prev=length;
683        UChar32 c;
684        if(b<0xc0) {
685            // trail byte: collect a multi-byte character
686            c=utf8_prevCharSafeBody(s, 0, &length, b, -1);
687            if(c<0) {
688                c=0xfffd;
689            }
690        } else {
691            // lead byte in last-trail position
692            c=0xfffd;
693        }
694        // c is a valid code point, not ASCII, not a surrogate
695        if(c<=0x7ff) {
696            if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
697                return prev+1;
698            }
699        } else if(c<=0xffff) {
700            int lead=c>>12;
701            uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
702            if(twoBits<=1) {
703                // All 64 code points with the same bits 15..6
704                // are either in the set or not.
705                if(twoBits!=(uint32_t)spanCondition) {
706                    return prev+1;
707                }
708            } else {
709                // Look up the code point in its 4k block of code points.
710                if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
711                    return prev+1;
712                }
713            }
714        } else {
715            if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
716                return prev+1;
717            }
718        }
719    } while(length>0);
720    return 0;
721}
722
723U_NAMESPACE_END
724