10596faeddefbf198de137d5e893708495ab1584cFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/* 4fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 5fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Copyright (C) 2012-2014, International Business Machines 6fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Corporation and others. All Rights Reserved. 7fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 8fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* utf8collationiterator.cpp 9fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* 10fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp) 11fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created by: Markus W. Scherer 12fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*/ 13fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 14fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utypes.h" 15fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 16fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#if !UCONFIG_NO_COLLATION 17fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 18fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utf8.h" 19fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "charstr.h" 20fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "cmemory.h" 21fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collation.h" 22fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationdata.h" 23fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationfcd.h" 24fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationiterator.h" 25fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "normalizer2impl.h" 26fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "uassert.h" 27fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "utf8collationiterator.h" 28fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 29fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_BEGIN 30fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 31fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::~UTF8CollationIterator() {} 32fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 33fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 34fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::resetToOffset(int32_t newOffset) { 35fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius reset(); 36fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = newOffset; 37fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 38fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 39fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 40fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::getOffset() const { 41fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return pos; 42fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 43fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 44fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 45fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { 46fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == length) { 47fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = U_SENTINEL; 48fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::FALLBACK_CE32; 49fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 50fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32(). 51fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = u8[pos++]; 52ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if(U8_IS_SINGLE(c)) { 53ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // ASCII 00..7F 54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return trie->data32[c]; 55fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 56fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t t1, t2; 57ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if(0xe0 <= c && c < 0xf0 && 58ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert ((pos + 1) < length || length < 0) && 59ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) && 60ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { 61ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // U+0800..U+FFFF except surrogates 62ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2); 63ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert pos += 2; 64ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 65ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { 66ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // U+0080..U+07FF 67fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; 68fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = ((c & 0x1f) << 6) | t1; 69fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++pos; 70fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 71fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 72fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Function call for supplementary code points and error cases. 73fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Illegal byte sequences yield U+FFFD. 74fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); 75fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return data->getCE32(c); 76fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 77fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 78fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 79fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 80fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::foundNULTerminator() { 81fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(length < 0) { 82fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius length = --pos; 83fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 84fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 85fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 86fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 87fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 88fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 89fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 90fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::forbidSurrogateCodePoints() const { 91fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 92fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 93fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 94fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUChar32 95fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { 96fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == length) { 97fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SENTINEL; 98fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 99fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(u8[pos] == 0 && length < 0) { 100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius length = pos; 101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SENTINEL; 102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c; 104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_NEXT_OR_FFFD(u8, pos, length, c); 105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUChar32 109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { 110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == 0) { 111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SENTINEL; 112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c; 114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_PREV_OR_FFFD(u8, 0, pos, c); 115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_FWD_N(u8, pos, length, num); 121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_BACK_N(u8, 0, pos, num); 126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// FCDUTF8CollationIterator ------------------------------------------------ *** 129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::~FCDUTF8CollationIterator() {} 131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::resetToOffset(int32_t newOffset) { 134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius reset(); 135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius start = pos = newOffset; 136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = CHECK_FWD; 137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::getOffset() const { 141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state != IN_NORMALIZED) { 142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return pos; 143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(pos == 0) { 144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return start; 145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return limit; 147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 151fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { 152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state == CHECK_FWD) { 154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath. 155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == length) { 156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = U_SENTINEL; 157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::FALLBACK_CE32; 158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = u8[pos++]; 160ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if(U8_IS_SINGLE(c)) { 161ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // ASCII 00..7F 162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return trie->data32[c]; 163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t t1, t2; 165ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if(0xe0 <= c && c < 0xf0 && 166ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert ((pos + 1) < length || length < 0) && 167ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) && 168ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { 169ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // U+0800..U+FFFF except surrogates 170ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2); 171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos += 2; 172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(CollationFCD::hasTccc(c) && 173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (CollationFCD::maybeTibetanCompositeVowel(c) || 174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (pos != length && nextHasLccc()))) { 175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos -= 3; 176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; // return CE32(BMP) 178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 179ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { 180ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert // U+0080..U+07FF 181ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; 182ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert c = ((c & 0x1f) << 6) | t1; 183ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert ++pos; 184ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) { 185ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert pos -= 2; 186ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } else { 187ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert return ce32; 188ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert } 189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Function call for supplementary code points and error cases. 191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Illegal byte sequences yield U+FFFD. 192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); 193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(c == 0xfffd) { 194fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::FFFD_CE32; 195fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 196fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(c > 0xffff); 197fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) { 198fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos -= 4; 199fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 200fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return data->getCE32FromSupplementary(c); 201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!nextSegment(errorCode)) { 205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = U_SENTINEL; 206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::FALLBACK_CE32; 207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius continue; 209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(state == IN_FCD_SEGMENT && pos != limit) { 210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return UTF8CollationIterator::handleNextCE32(c, errorCode); 211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(state == IN_NORMALIZED && pos != normalized.length()) { 212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = normalized[pos++]; 213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius switchToForward(); 216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 218fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 219fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 220fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 221fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::nextHasLccc() const { 223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(state == CHECK_FWD && pos != length); 224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8. 225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.) 226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = u8[pos]; 227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; } 228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = pos; 229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_NEXT_OR_FFFD(u8, i, length, c); 230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(c > 0xffff) { c = U16_LEAD(c); } 231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return CollationFCD::hasLccc(c); 232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::previousHasTccc() const { 236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(state == CHECK_BWD && pos != 0); 237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = u8[pos - 1]; 238ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if(U8_IS_SINGLE(c)) { return FALSE; } 239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = pos; 240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_PREV_OR_FFFD(u8, 0, i, c); 241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(c > 0xffff) { c = U16_LEAD(c); } 242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return CollationFCD::hasTccc(c); 243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUChar 246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::handleGetTrailSurrogate() { 247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state != IN_NORMALIZED) { return 0; } 248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(pos < normalized.length()); 249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar trail; 250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; } 251fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return trail; 252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::foundNULTerminator() { 256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state == CHECK_FWD && length < 0) { 257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius length = --pos; 258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 259fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 260fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUChar32 265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) { 266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c; 267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state == CHECK_FWD) { 269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == length || ((c = u8[pos]) == 0 && length < 0)) { 270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SENTINEL; 271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 272ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if(U8_IS_SINGLE(c)) { 273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++pos; 274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_NEXT_OR_FFFD(u8, pos, length, c); 277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) && 278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (CollationFCD::maybeTibetanCompositeVowel(c) || 279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (pos != length && nextHasLccc()))) { 280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence 281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and we can use U8_LENGTH() rather than a previous-position variable. 282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos -= U8_LENGTH(c); 283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!nextSegment(errorCode)) { 284fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SENTINEL; 285fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 286fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius continue; 287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(state == IN_FCD_SEGMENT && pos != limit) { 290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_NEXT_OR_FFFD(u8, pos, length, c); 291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(state == IN_NORMALIZED && pos != normalized.length()) { 293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = normalized.char32At(pos); 294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos += U16_LENGTH(c); 295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius switchToForward(); 298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUChar32 303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) { 304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c; 305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state == CHECK_BWD) { 307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == 0) { 308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SENTINEL; 309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 310ffdc27edd5503111189fc11165c5a11289a71f79Fredrik Roubert if(U8_IS_SINGLE(c = u8[pos - 1])) { 311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius --pos; 312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_PREV_OR_FFFD(u8, 0, pos, c); 315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) && 316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (CollationFCD::maybeTibetanCompositeVowel(c) || 317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (pos != 0 && previousHasTccc()))) { 318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence 319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and we can use U8_LENGTH() rather than a previous-position variable. 320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos += U8_LENGTH(c); 321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!previousSegment(errorCode)) { 322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SENTINEL; 323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius continue; 325fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(state == IN_FCD_SEGMENT && pos != start) { 328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_PREV_OR_FFFD(u8, 0, pos, c); 329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(state >= IN_NORMALIZED && pos != 0) { 331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = normalized.char32At(pos - 1); 332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos -= U16_LENGTH(c); 333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius switchToBackward(); 336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Specify the class to avoid a virtual-function indirection. 343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // In Java, we would declare this class final. 344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) { 345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius --num; 346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Specify the class to avoid a virtual-function indirection. 352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // In Java, we would declare this class final. 353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) { 354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius --num; 355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::switchToForward() { 360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(state == CHECK_BWD || 361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (state == IN_FCD_SEGMENT && pos == limit) || 362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (state == IN_NORMALIZED && pos == normalized.length())); 363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state == CHECK_BWD) { 364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Turn around from backward checking. 365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius start = pos; 366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == limit) { 367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = CHECK_FWD; // Check forward. 368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { // pos < limit 369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = IN_FCD_SEGMENT; // Stay in FCD segment. 370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 372fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Reached the end of the FCD segment. 373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state == IN_FCD_SEGMENT) { 374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The input text segment is FCD, extend it forward. 375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The input text segment needed to be normalized. 377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Switch to checking forward from it. 378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius start = pos = limit; 379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = CHECK_FWD; 381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) { 386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return FALSE; } 387fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(state == CHECK_FWD && pos != length); 388fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The input text [start..pos[ passes the FCD check. 389fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t segmentStart = pos; 390fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Collect the characters being checked, in case they need to be normalized. 391fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString s; 392fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t prevCC = 0; 393fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 394fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Fetch the next character and its fcd16 value. 395fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t cpStart = pos; 396fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c; 397fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_NEXT_OR_FFFD(u8, pos, length, c); 398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint16_t fcd16 = nfcImpl.getFCD16(c); 399fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t leadCC = (uint8_t)(fcd16 >> 8); 400fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(leadCC == 0 && cpStart != segmentStart) { 401fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // FCD boundary before this character. 402fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = cpStart; 403fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 404fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 405fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s.append(c); 406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 407fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Fails FCD check. Find the next FCD boundary and normalize. 408fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(pos != length) { 409fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cpStart = pos; 410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_NEXT_OR_FFFD(u8, pos, length, c); 411fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nfcImpl.getFCD16(c) <= 0xff) { 412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = cpStart; 413fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 414fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 415fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s.append(c); 416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!normalize(s, errorCode)) { return FALSE; } 418fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius start = segmentStart; 419fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius limit = pos; 420fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = IN_NORMALIZED; 421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = 0; 422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 424fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius prevCC = (uint8_t)fcd16; 425fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == length || prevCC == 0) { 426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // FCD boundary after the last character. 427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius limit = pos; 431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = segmentStart; 432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(pos != limit); 433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = IN_FCD_SEGMENT; 434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 438fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::switchToBackward() { 439fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(state == CHECK_FWD || 440fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (state == IN_FCD_SEGMENT && pos == start) || 441fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (state >= IN_NORMALIZED && pos == 0)); 442fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state == CHECK_FWD) { 443fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Turn around from forward checking. 444fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius limit = pos; 445fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == start) { 446fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = CHECK_BWD; // Check backward. 447fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { // pos > start 448fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = IN_FCD_SEGMENT; // Stay in FCD segment. 449fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 450fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 451fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Reached the start of the FCD segment. 452fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(state == IN_FCD_SEGMENT) { 453fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The input text segment is FCD, extend it backward. 454fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 455fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The input text segment needed to be normalized. 456fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Switch to checking backward from it. 457fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius limit = pos = start; 458fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 459fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = CHECK_BWD; 460fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 461fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 462fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 463fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 464fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) { 465fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return FALSE; } 466fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(state == CHECK_BWD && pos != 0); 467fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The input text [pos..limit[ passes the FCD check. 468fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t segmentLimit = pos; 469fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Collect the characters being checked, in case they need to be normalized. 470fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString s; 471fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t nextCC = 0; 472fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 473fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Fetch the previous character and its fcd16 value. 474fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t cpLimit = pos; 475fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c; 476fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_PREV_OR_FFFD(u8, 0, pos, c); 477fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint16_t fcd16 = nfcImpl.getFCD16(c); 478fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t trailCC = (uint8_t)fcd16; 479fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(trailCC == 0 && cpLimit != segmentLimit) { 480fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // FCD boundary after this character. 481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = cpLimit; 482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s.append(c); 485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Fails FCD check. Find the previous FCD boundary and normalize. 488fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(fcd16 > 0xff && pos != 0) { 489fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cpLimit = pos; 490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U8_PREV_OR_FFFD(u8, 0, pos, c); 491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fcd16 = nfcImpl.getFCD16(c); 492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fcd16 == 0) { 493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = cpLimit; 494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s.append(c); 497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s.reverse(); 499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!normalize(s, errorCode)) { return FALSE; } 500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius limit = segmentLimit; 501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius start = pos; 502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = IN_NORMALIZED; 503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = normalized.length(); 504fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 505fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 506fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nextCC = (uint8_t)(fcd16 >> 8); 507fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == 0 || nextCC == 0) { 508fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // FCD boundary before the following character. 509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius start = pos; 513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = segmentLimit; 514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(pos != start); 515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius state = IN_FCD_SEGMENT; 516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) { 521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // NFD without argument checking. 522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(U_SUCCESS(errorCode)); 523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfcImpl.decompose(s, normalized, errorCode); 524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SUCCESS(errorCode); 525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_END 528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif // !UCONFIG_NO_COLLATION 530