16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ******************************************************************************* 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copyright (C) 1999-2001, International Business Machines 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Corporation and others. All Rights Reserved. 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ******************************************************************************* 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * file name: scrptrun.cpp 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * created on: 10/17/2001 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * created by: Eric R. Mader 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uscript.h" 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "scrptrun.h" 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst char ScriptRun::fgClassID=0; 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUChar32 ScriptRun::pairedChars[] = { 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0028, 0x0029, // ascii paired punctuation 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x003c, 0x003e, 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x005b, 0x005d, 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x007b, 0x007d, 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x00ab, 0x00bb, // guillemets 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x2018, 0x2019, // general punctuation 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x201c, 0x201d, 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x2039, 0x203a, 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x3008, 0x3009, // chinese paired punctuation 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x300a, 0x300b, 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x300c, 0x300d, 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x300e, 0x300f, 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x3010, 0x3011, 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x3014, 0x3015, 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x3016, 0x3017, 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x3018, 0x3019, 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x301a, 0x301b 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst int32_t ScriptRun::pairedCharCount = ARRAY_SIZE(pairedChars); 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst int32_t ScriptRun::pairedCharPower = 1 << highBit(pairedCharCount); 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst int32_t ScriptRun::pairedCharExtra = pairedCharCount - pairedCharPower; 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint8_t ScriptRun::highBit(int32_t value) 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (value <= 0) { 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return -32; 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int8_t bit = 0; 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (value >= 1 << 16) { 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org value >>= 16; 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bit += 16; 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (value >= 1 << 8) { 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org value >>= 8; 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bit += 8; 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (value >= 1 << 4) { 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org value >>= 4; 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bit += 4; 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (value >= 1 << 2) { 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org value >>= 2; 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bit += 2; 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (value >= 1 << 1) { 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org value >>= 1; 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bit += 1; 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return bit; 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t ScriptRun::getPairIndex(UChar32 ch) 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t probe = pairedCharPower; 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t index = 0; 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (ch >= pairedChars[pairedCharExtra]) { 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org index = pairedCharExtra; 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (probe > (1 << 0)) { 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org probe >>= 1; 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (ch >= pairedChars[index + probe]) { 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org index += probe; 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (pairedChars[index] != ch) { 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org index = -1; 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return index; 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool ScriptRun::sameScript(int32_t scriptOne, int32_t scriptTwo) 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo; 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool ScriptRun::next() 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t startSP = parenSP; // used to find the first new open character 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode error = U_ZERO_ERROR; 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if we've fallen off the end of the text, we're done 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (scriptEnd >= charLimit) { 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return false; 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org scriptCode = USCRIPT_COMMON; 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) { 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar high = charArray[scriptEnd]; 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 ch = high; 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if the character is a high surrogate and it's not the last one 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // in the text, see if it's followed by a low surrogate 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (high >= 0xD800 && high <= 0xDBFF && scriptEnd < charLimit - 1) 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org { 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar low = charArray[scriptEnd + 1]; 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if it is followed by a low surrogate, 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // consume it and form the full character 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (low >= 0xDC00 && low <= 0xDFFF) { 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000; 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org scriptEnd += 1; 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UScriptCode sc = uscript_getScript(ch, &error); 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pairIndex = getPairIndex(ch); 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Paired character handling: 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if it's an open character, push it onto the stack. 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if it's a close character, find the matching open on the 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // stack, and use that script code. Any non-matching open 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // characters above it on the stack will be poped. 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (pairIndex >= 0) { 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((pairIndex & 1) == 0) { 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org parenStack[++parenSP].pairIndex = pairIndex; 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org parenStack[parenSP].scriptCode = scriptCode; 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if (parenSP >= 0) { 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pi = pairIndex & ~1; 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) { 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org parenSP -= 1; 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (parenSP < startSP) { 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org startSP = parenSP; 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (parenSP >= 0) { 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org sc = parenStack[parenSP].scriptCode; 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (sameScript(scriptCode, sc)) { 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) { 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org scriptCode = sc; 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // now that we have a final script code, fix any open 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // characters we pushed before we knew the script code. 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (startSP < parenSP) { 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org parenStack[++startSP].scriptCode = scriptCode; 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if this character is a close paired character, 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // pop it from the stack 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org parenSP -= 1; 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org startSP -= 1; 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if the run broke on a surrogate pair, 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // end it before the high surrogate 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (ch >= 0x10000) { 1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org scriptEnd -= 1; 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return true; 2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 203