/*
 *******************************************************************************
 *
 *   Copyright (C) 1999-2001, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  scrptrun.cpp
 *
 *   created on: 10/17/2001
 *   created by: Eric R. Mader
 */
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uscript.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "scrptrun.h"
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
// Number of elements in a built-in array.
//
// Both uses of the argument are parenthesized so that any array-valued
// expression (not only a plain identifier) expands correctly. Only meaningful
// for true arrays: given a pointer it would silently compute
// sizeof(pointer) / sizeof(element).
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst char ScriptRun::fgClassID=0;
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUChar32 ScriptRun::pairedChars[] = {
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x0028, 0x0029, // ascii paired punctuation
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x003c, 0x003e,
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x005b, 0x005d,
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x007b, 0x007d,
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x00ab, 0x00bb, // guillemets
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x2018, 0x2019, // general punctuation
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x201c, 0x201d,
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x2039, 0x203a,
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x3008, 0x3009, // chinese paired punctuation
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x300a, 0x300b,
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x300c, 0x300d,
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x300e, 0x300f,
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x3010, 0x3011,
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x3014, 0x3015,
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x3016, 0x3017,
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x3018, 0x3019,
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    0x301a, 0x301b
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst int32_t ScriptRun::pairedCharCount = ARRAY_SIZE(pairedChars);
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst int32_t ScriptRun::pairedCharPower = 1 << highBit(pairedCharCount);
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst int32_t ScriptRun::pairedCharExtra = pairedCharCount - pairedCharPower;
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint8_t ScriptRun::highBit(int32_t value)
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (value <= 0) {
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return -32;
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int8_t bit = 0;
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (value >= 1 << 16) {
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        value >>= 16;
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        bit += 16;
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (value >= 1 << 8) {
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        value >>= 8;
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        bit += 8;
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (value >= 1 << 4) {
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        value >>= 4;
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        bit += 4;
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (value >= 1 << 2) {
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        value >>= 2;
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        bit += 2;
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (value >= 1 << 1) {
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        value >>= 1;
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        bit += 1;
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return bit;
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t ScriptRun::getPairIndex(UChar32 ch)
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t probe = pairedCharPower;
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t index = 0;
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (ch >= pairedChars[pairedCharExtra]) {
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        index = pairedCharExtra;
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while (probe > (1 << 0)) {
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        probe >>= 1;
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (ch >= pairedChars[index + probe]) {
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            index += probe;
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (pairedChars[index] != ch) {
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        index = -1;
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return index;
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool ScriptRun::sameScript(int32_t scriptOne, int32_t scriptTwo)
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool ScriptRun::next()
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t startSP  = parenSP;  // used to find the first new open character
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode error = U_ZERO_ERROR;
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // if we've fallen off the end of the text, we're done
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (scriptEnd >= charLimit) {
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return false;
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    scriptCode = USCRIPT_COMMON;
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) {
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UChar   high = charArray[scriptEnd];
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UChar32 ch   = high;
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // if the character is a high surrogate and it's not the last one
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // in the text, see if it's followed by a low surrogate
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (high >= 0xD800 && high <= 0xDBFF && scriptEnd < charLimit - 1)
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        {
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            UChar low = charArray[scriptEnd + 1];
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // if it is followed by a low surrogate,
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // consume it and form the full character
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (low >= 0xDC00 && low <= 0xDFFF) {
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                scriptEnd += 1;
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UScriptCode sc = uscript_getScript(ch, &error);
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t pairIndex = getPairIndex(ch);
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Paired character handling:
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        //
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // if it's an open character, push it onto the stack.
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // if it's a close character, find the matching open on the
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // stack, and use that script code. Any non-matching open
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // characters above it on the stack will be poped.
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (pairIndex >= 0) {
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if ((pairIndex & 1) == 0) {
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                parenStack[++parenSP].pairIndex = pairIndex;
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                parenStack[parenSP].scriptCode  = scriptCode;
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if (parenSP >= 0) {
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t pi = pairIndex & ~1;
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) {
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    parenSP -= 1;
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (parenSP < startSP) {
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    startSP = parenSP;
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (parenSP >= 0) {
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    sc = parenStack[parenSP].scriptCode;
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (sameScript(scriptCode, sc)) {
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                scriptCode = sc;
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // now that we have a final script code, fix any open
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // characters we pushed before we knew the script code.
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                while (startSP < parenSP) {
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    parenStack[++startSP].scriptCode = scriptCode;
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // if this character is a close paired character,
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // pop it from the stack
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                parenSP -= 1;
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                startSP -= 1;
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // if the run broke on a surrogate pair,
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // end it before the high surrogate
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (ch >= 0x10000) {
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                scriptEnd -= 1;
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return true;
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
203