1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************* 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Copyright (C) 1999-2001, International Business Machines 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Corporation and others. All Rights Reserved. 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************* 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * file name: scrptrun.cpp 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * created on: 10/17/2001 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * created by: Eric R. Mader 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uscript.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "scrptrun.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst char ScriptRun::fgClassID=0; 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUChar32 ScriptRun::pairedChars[] = { 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x0028, 0x0029, // ascii paired punctuation 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x003c, 0x003e, 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x005b, 0x005d, 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x007b, 0x007d, 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x00ab, 0x00bb, // guillemets 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x2018, 0x2019, // general punctuation 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x201c, 0x201d, 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x2039, 0x203a, 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x3008, 0x3009, // chinese paired punctuation 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x300a, 0x300b, 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x300c, 0x300d, 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x300e, 0x300f, 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x3010, 0x3011, 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x3014, 0x3015, 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x3016, 0x3017, 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x3018, 0x3019, 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x301a, 0x301b 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst int32_t ScriptRun::pairedCharCount = ARRAY_SIZE(pairedChars); 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst int32_t ScriptRun::pairedCharPower = 1 << highBit(pairedCharCount); 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst int32_t ScriptRun::pairedCharExtra = pairedCharCount - pairedCharPower; 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint8_t ScriptRun::highBit(int32_t value) 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (value <= 0) { 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return -32; 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int8_t bit = 0; 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (value >= 1 << 16) { 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value >>= 16; 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bit += 16; 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (value >= 1 << 8) { 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value >>= 8; 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bit += 8; 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (value >= 1 << 4) { 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value >>= 4; 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bit += 4; 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (value >= 1 << 2) { 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value >>= 2; 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bit += 2; 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (value >= 1 << 1) { 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru value >>= 1; 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bit += 1; 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return bit; 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t ScriptRun::getPairIndex(UChar32 ch) 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t probe = pairedCharPower; 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t index = 0; 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ch >= pairedChars[pairedCharExtra]) { 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index = pairedCharExtra; 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (probe > (1 << 0)) { 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru probe >>= 1; 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ch >= pairedChars[index + probe]) { 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index += probe; 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (pairedChars[index] != ch) { 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index = -1; 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return index; 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool ScriptRun::sameScript(int32_t scriptOne, int32_t scriptTwo) 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo; 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool ScriptRun::next() 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t startSP = parenSP; // used to find the first new open character 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode error = U_ZERO_ERROR; 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if we've fallen off the end of the text, we're done 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (scriptEnd >= charLimit) { 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return false; 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru scriptCode = USCRIPT_COMMON; 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) { 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar high = charArray[scriptEnd]; 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 ch = high; 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if the character is a high surrogate and it's not the last one 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // in the text, see if it's followed by a low surrogate 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (high >= 0xD800 && high <= 0xDBFF && scriptEnd < charLimit - 1) 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar low = charArray[scriptEnd + 1]; 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if it is followed by a low surrogate, 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // consume it and form the full character 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (low >= 0xDC00 && low <= 0xDFFF) { 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000; 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru scriptEnd += 1; 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UScriptCode sc = uscript_getScript(ch, &error); 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pairIndex = getPairIndex(ch); 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Paired character handling: 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if it's an open character, push it onto the stack. 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if it's a close character, find the matching open on the 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // stack, and use that script code. Any non-matching open 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // characters above it on the stack will be poped. 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (pairIndex >= 0) { 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((pairIndex & 1) == 0) { 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru parenStack[++parenSP].pairIndex = pairIndex; 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru parenStack[parenSP].scriptCode = scriptCode; 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if (parenSP >= 0) { 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t pi = pairIndex & ~1; 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) { 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru parenSP -= 1; 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (parenSP < startSP) { 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startSP = parenSP; 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (parenSP >= 0) { 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru sc = parenStack[parenSP].scriptCode; 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (sameScript(scriptCode, sc)) { 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) { 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru scriptCode = sc; 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // now that we have a final script code, fix any open 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // characters we pushed before we knew the script code. 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (startSP < parenSP) { 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru parenStack[++startSP].scriptCode = scriptCode; 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if this character is a close paired character, 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // pop it from the stack 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru parenSP -= 1; 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startSP -= 1; 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if the run broke on a surrogate pair, 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // end it before the high surrogate 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ch >= 0x10000) { 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru scriptEnd -= 1; 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return true; 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 203