16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/*
26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org******************************************************************************
36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*
46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   Copyright (C) 2007-2012, International Business Machines
56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   Corporation and others.  All Rights Reserved.
66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*
76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org******************************************************************************
86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   file name:  bmpset.cpp
96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   encoding:   US-ASCII
106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   tab size:   8 (not used)
116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   indentation:4
126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*
136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   created on: 2007jan29
146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   created by: Markus W. Scherer
156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/
166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h"
186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h"
196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf8.h"
206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf16.h"
216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cmemory.h"
226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "bmpset.h"
236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uassert.h"
246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN
266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        list(parentList), listLength(parentListLength) {
296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uprv_memset(table7FF, 0, sizeof(table7FF));
316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /*
346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Set the list indexes for binary searches for
356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * looked up in the bit tables.
386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * The last pair of indexes is for finding supplementary code points.
396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t i;
426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    for(i=1; i<=0x10; ++i) {
436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    list4kStarts[0x11]=listLength-1;
466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    initBits();
486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    overrideIllegal();
496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        list(newParentList), listLength(newParentListLength) {
536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::~BMPSet() {
606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/*
636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Set bits in a bit rectangle in "vertical" bit organization.
646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * start<limit<=0x800
656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    U_ASSERT(start<limit);
686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    U_ASSERT(limit<=0x800);
696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Set one bit indicating an all-one block.
746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uint32_t bits=(uint32_t)1<<lead;
756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if((start+1)==limit) {  // Single-character shortcut.
766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        table[trail]|=bits;
776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return;
786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t limitLead=limit>>6;
816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t limitTrail=limit&0x3f;
826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if(lead==limitLead) {
846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Partial vertical bit column.
856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        while(trail<limitTrail) {
866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            table[trail++]|=bits;
876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else {
896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Partial vertical bit column,
906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // followed by a bit rectangle,
916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // followed by another partial vertical bit column.
926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(trail>0) {
936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            do {
946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                table[trail++]|=bits;
956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } while(trail<64);
966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            ++lead;
976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(lead<limitLead) {
996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            bits=~((1<<lead)-1);
1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(limitLead<0x20) {
1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                bits&=(1<<limitLead)-1;
1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            for(trail=0; trail<64; ++trail) {
1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                table[trail]|=bits;
1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // In that case, bits=1<<limitLead is undefined but the bits value
1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // is not used because trail<limitTrail is already false.
1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for(trail=0; trail<limitTrail; ++trail) {
1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            table[trail]|=bits;
1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid BMPSet::initBits() {
1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar32 start, limit;
1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t listIndex=0;
1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Set asciiBytes[].
1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    do {
1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        start=list[listIndex++];
1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(listIndex<listLength) {
1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            limit=list[listIndex++];
1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else {
1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            limit=0x110000;
1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(start>=0x80) {
1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        do {
1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            asciiBytes[start++]=1;
1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } while(start<limit && start<0x80);
1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } while(limit<=0x80);
1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Set table7FF[].
1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    while(start<0x800) {
1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(limit>0x800) {
1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            start=0x800;
1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        start=list[listIndex++];
1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(listIndex<listLength) {
1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            limit=list[listIndex++];
1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else {
1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            limit=0x110000;
1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Set bmpBlockBits[].
1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t minStart=0x800;
1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    while(start<0x10000) {
1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(limit>0x10000) {
1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            limit=0x10000;
1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(start<minStart) {
1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            start=minStart;
1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(start&0x3f) {
1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // Mixed-value block of 64 code points.
1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                start>>=6;
1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                start=(start+1)<<6;  // Round up to the next block boundary.
1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                minStart=start;      // Ignore further ranges in this block.
1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(start<limit) {
1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(start<(limit&~0x3f)) {
1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // Multiple all-ones blocks of 64 code points each.
1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    set32x64Bits(bmpBlockBits, start>>6, limit>>6);
1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(limit&0x3f) {
1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // Mixed-value block of 64 code points.
1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    limit>>=6;
1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    limit=(limit+1)<<6;  // Round up to the next block boundary.
1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    minStart=limit;      // Ignore further ranges in this block.
1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(limit==0x10000) {
1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        start=list[listIndex++];
1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(listIndex<listLength) {
1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            limit=list[listIndex++];
1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else {
1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            limit=0x110000;
1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/*
2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Override some bits and bytes to the result of contains(FFFD)
2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * for faster validity checking at runtime.
2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * No need to set 0 values where they were reset to 0 in the constructor
2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and not modified by initBits().
2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Need to set 0 values for surrogates D800..DFFF.
2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid BMPSet::overrideIllegal() {
2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uint32_t bits, mask;
2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t i;
2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // contains(FFFD)==TRUE
2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for(i=0x80; i<0xc0; ++i) {
2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            asciiBytes[i]=1;
2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        bits=3;                 // Lead bytes 0xC0 and 0xC1.
2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for(i=0; i<64; ++i) {
2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            table7FF[i]|=bits;
2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        bits=1;                 // Lead byte 0xE0.
2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for(i=0; i<32; ++i) {   // First half of 4k block.
2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            bmpBlockBits[i]|=bits;
2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        mask=~(0x10001<<0xd);   // Lead byte 0xED.
2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        bits=1<<0xd;
2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for(i=32; i<64; ++i) {  // Second half of 4k block.
2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else {
2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // contains(FFFD)==FALSE
2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        mask=~(0x10001<<0xd);   // Lead byte 0xED.
2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for(i=32; i<64; ++i) {  // Second half of 4k block.
2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            bmpBlockBits[i]&=mask;
2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /* Examples:
2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                       findCodePoint(c)
2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       set              list[]         c=0 1 3 4 7 8
2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       ===              ==============   ===========
2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       []               [110000]         0 0 0 0 0 0
2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       [:Any:]          [0, 110000]      1 1 1 1 1 1
2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Return the smallest i such that c < list[i].  Assume
2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (c < list[lo])
2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return lo;
2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // High runner test.  c is often after the last range, so an
2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // initial check for this condition pays off.
2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (lo >= hi || c >= list[hi-1])
2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return hi;
2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // invariant: c >= list[lo]
2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // invariant: c < list[hi]
2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    for (;;) {
2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        int32_t i = (lo + hi) >> 1;
2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (i == lo) {
2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break; // Found!
2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else if (c < list[i]) {
2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            hi = i;
2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else {
2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            lo = i;
2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return hi;
2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool
2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::contains(UChar32 c) const {
2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if((uint32_t)c<=0x7f) {
2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return (UBool)asciiBytes[c];
2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else if((uint32_t)c<=0x7ff) {
2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        int lead=c>>12;
2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(twoBits<=1) {
2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // All 64 code points with the same bits 15..6
2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // are either in the set or not.
2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            return (UBool)twoBits;
2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else {
2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // Look up the code point in its 4k block of code points.
2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else if((uint32_t)c<=0x10ffff) {
2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // surrogate or supplementary code point
2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else {
2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Out-of-range code points get FALSE, consistent with long-standing
2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // behavior of UnicodeSet::contains(c).
2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return FALSE;
3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/*
3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Check for sufficient length for trail unit for each surrogate pair.
3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Handle single surrogates as surrogate code points as usual in ICU.
3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst UChar *
3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar c, c2;
3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if(spanCondition) {
3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // span
3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        do {
3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            c=*s;
3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(c<=0x7f) {
3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(!asciiBytes[c]) {
3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<=0x7ff) {
3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<0xd800 || c>=0xe000) {
3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                int lead=c>>12;
3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(twoBits<=1) {
3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // All 64 code points with the same bits 15..6
3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // are either in the set or not.
3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(twoBits==0) {
3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        break;
3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                } else {
3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // Look up the code point in its 4k block of code points.
3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        break;
3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // surrogate code point
3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else {
3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // surrogate pair
3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                ++s;
3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } while(++s<limit);
3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else {
3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // span not
3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        do {
3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            c=*s;
3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(c<=0x7f) {
3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(asciiBytes[c]) {
3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<=0x7ff) {
3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<0xd800 || c>=0xe000) {
3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                int lead=c>>12;
3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(twoBits<=1) {
3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // All 64 code points with the same bits 15..6
3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // are either in the set or not.
3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(twoBits!=0) {
3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        break;
3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                } else {
3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // Look up the code point in its 4k block of code points.
3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        break;
3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // surrogate code point
3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else {
3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // surrogate pair
3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                ++s;
3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } while(++s<limit);
3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return s;
3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* Symmetrical with span(). */
3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst UChar *
3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar c, c2;
3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if(spanCondition) {
4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // span
4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for(;;) {
4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            c=*(--limit);
4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(c<=0x7f) {
4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(!asciiBytes[c]) {
4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<=0x7ff) {
4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<0xd800 || c>=0xe000) {
4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                int lead=c>>12;
4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(twoBits<=1) {
4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // All 64 code points with the same bits 15..6
4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // are either in the set or not.
4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(twoBits==0) {
4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        break;
4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                } else {
4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // Look up the code point in its 4k block of code points.
4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        break;
4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // surrogate code point
4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else {
4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // surrogate pair
4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                --limit;
4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(s==limit) {
4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                return s;
4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else {
4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // span not
4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for(;;) {
4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            c=*(--limit);
4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(c<=0x7f) {
4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(asciiBytes[c]) {
4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<=0x7ff) {
4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<0xd800 || c>=0xe000) {
4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                int lead=c>>12;
4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(twoBits<=1) {
4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // All 64 code points with the same bits 15..6
4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // are either in the set or not.
4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(twoBits!=0) {
4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        break;
4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                } else {
4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    // Look up the code point in its 4k block of code points.
4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        break;
4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // surrogate code point
4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else {
4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // surrogate pair
4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    break;
4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                --limit;
4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(s==limit) {
4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                return s;
4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return limit+1;
4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/*
4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Precheck for sufficient trail bytes at end of string only once per span.
4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Check validity.
4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst uint8_t *
4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const uint8_t *limit=s+length;
4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uint8_t b=*s;
4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if((int8_t)b>=0) {
4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Initial all-ASCII span.
5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(spanCondition) {
5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            do {
5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(!asciiBytes[b] || ++s==limit) {
5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    return s;
5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                b=*s;
5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } while((int8_t)b>=0);
5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else {
5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            do {
5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(asciiBytes[b] || ++s==limit) {
5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    return s;
5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                b=*s;
5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } while((int8_t)b>=0);
5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        length=(int32_t)(limit-s);
5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
5206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const uint8_t *limit0=limit;
5236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /*
5256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Make sure that the last 1/2/3/4-byte sequence before limit is complete
5266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * or runs into a lead byte.
5276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * In the span loop compare s with limit only once
5286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * per multi-byte character.
5296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
5306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Give a trailing illegal sequence the same value as the result of contains(FFFD),
5316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * including it if that is part of the span, otherwise set limit0 to before
5326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * the truncated sequence.
5336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
5346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    b=*(limit-1);
5356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if((int8_t)b<0) {
5366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // b>=0x80: lead or trail byte
5376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(b<0xc0) {
5386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // single trail byte, check for preceding 3- or 4-byte lead byte
5396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(length>=2 && (b=*(limit-2))>=0xe0) {
5406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                limit-=2;
5416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(asciiBytes[0x80]!=spanCondition) {
5426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    limit0=limit;
5436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
5446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
5456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // 4-byte lead byte with only two trail bytes
5466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                limit-=3;
5476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(asciiBytes[0x80]!=spanCondition) {
5486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    limit0=limit;
5496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
5506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
5516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else {
5526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // lead byte with no trail bytes
5536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            --limit;
5546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(asciiBytes[0x80]!=spanCondition) {
5556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                limit0=limit;
5566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
5576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
5586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uint8_t t1, t2, t3;
5616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    while(s<limit) {
5636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        b=*s;
5646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(b<0xc0) {
5656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // ASCII; or trail bytes with the result of contains(FFFD).
5666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(spanCondition) {
5676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                do {
5686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(!asciiBytes[b]) {
5696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        return s;
5706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    } else if(++s==limit) {
5716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        return limit0;
5726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
5736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    b=*s;
5746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                } while(b<0xc0);
5756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else {
5766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                do {
5776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(asciiBytes[b]) {
5786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        return s;
5796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    } else if(++s==limit) {
5806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        return limit0;
5816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
5826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    b=*s;
5836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                } while(b<0xc0);
5846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
5856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
5866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        ++s;  // Advance past the lead byte.
5876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(b>=0xe0) {
5886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(b<0xf0) {
5896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if( /* handle U+0000..U+FFFF inline */
5906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
5916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
5926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                ) {
5936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    b&=0xf;
5946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
5956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(twoBits<=1) {
5966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        // All 64 code points with this lead byte and middle trail byte
5976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        // are either in the set or not.
5986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        if(twoBits!=(uint32_t)spanCondition) {
5996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                            return s-1;
6006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        }
6016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    } else {
6026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        // Look up the code point in its 4k block of code points.
6036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        UChar32 c=(b<<12)|(t1<<6)|t2;
6046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
6056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                            return s-1;
6066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        }
6076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
6086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    s+=2;
6096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    continue;
6106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
6116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else if( /* handle U+10000..U+10FFFF inline */
6126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
6136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
6146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
6156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            ) {
6166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // Give an illegal sequence the same value as the result of contains(FFFD).
6176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
6186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if( (   (0x10000<=c && c<=0x10ffff) ?
6196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                            containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
6206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                            asciiBytes[0x80]
6216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    ) != spanCondition
6226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                ) {
6236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    return s-1;
6246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
6256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                s+=3;
6266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                continue;
6276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
6286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else /* 0xc0<=b<0xe0 */ {
6296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if( /* handle U+0000..U+07FF inline */
6306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                (t1=(uint8_t)(*s-0x80)) <= 0x3f
6316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            ) {
6326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
6336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    return s-1;
6346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
6356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                ++s;
6366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                continue;
6376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
6386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
6396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Give an illegal sequence the same value as the result of contains(FFFD).
6416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Handle each byte of an illegal sequence separately to simplify the code;
6426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // no need to optimize error handling.
6436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(asciiBytes[0x80]!=spanCondition) {
6446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            return s-1;
6456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
6466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
6476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return limit0;
6496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
6506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/*
6526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * While going backwards through UTF-8 optimize only for ASCII.
6536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
6546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * possible to tell from the last byte in a multi-byte sequence how many
6556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * preceding bytes there should be. Therefore, going backwards through UTF-8
6566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * is much harder than going forward.
6576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
6586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t
6596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
6606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
6616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
6626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
6636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uint8_t b;
6656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    do {
6676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        b=s[--length];
6686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if((int8_t)b>=0) {
6696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // ASCII sub-span
6706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(spanCondition) {
6716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                do {
6726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(!asciiBytes[b]) {
6736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        return length+1;
6746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    } else if(length==0) {
6756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        return 0;
6766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
6776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    b=s[--length];
6786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                } while((int8_t)b>=0);
6796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else {
6806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                do {
6816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    if(asciiBytes[b]) {
6826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        return length+1;
6836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    } else if(length==0) {
6846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        return 0;
6856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    }
6866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    b=s[--length];
6876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                } while((int8_t)b>=0);
6886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
6896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
6906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        int32_t prev=length;
6926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UChar32 c;
6936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // trail byte: collect a multi-byte character
6946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // (or  lead byte in last-trail position)
6956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
6966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // c is a valid code point, not ASCII, not a surrogate
6976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(c<=0x7ff) {
6986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
6996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                return prev+1;
7006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
7016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else if(c<=0xffff) {
7026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            int lead=c>>12;
7036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
7046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(twoBits<=1) {
7056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // All 64 code points with the same bits 15..6
7066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // are either in the set or not.
7076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(twoBits!=(uint32_t)spanCondition) {
7086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    return prev+1;
7096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
7106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else {
7116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // Look up the code point in its 4k block of code points.
7126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
7136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    return prev+1;
7146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
7156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
7166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } else {
7176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
7186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                return prev+1;
7196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
7206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
7216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } while(length>0);
7226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return 0;
7236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
7246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END
726