1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 */
11
12package com.ibm.icu.lang;
13
14import com.ibm.icu.text.UTF16;
15
16/**
17 * <code>UScriptRun</code> is used to find runs of characters in
18 * the same script, as defined in the <code>UScript</code> class.
19 * It implements a simple iterator over an array of characters.
20 * The iterator will assign <code>COMMON</code> and <code>INHERITED</code>
21 * characters to the same script as the preceeding characters. If the
22 * COMMON and INHERITED characters are first, they will be assigned to
23 * the same script as the following characters.
24 *
25 * The iterator will try to match paired punctuation. If it sees an
26 * opening punctuation character, it will remember the script that
27 * was assigned to that character, and assign the same script to the
28 * matching closing punctuation.
29 *
30 * No attempt is made to combine related scripts into a single run. In
31 * particular, Hiragana, Katakana, and Han characters will appear in separate
32 * runs.
33
34 * Here is an example of how to iterate over script runs:
35 * <pre>
36 * void printScriptRuns(char[] text)
37 * {
38 *     UScriptRun scriptRun = new UScriptRun(text);
39 *
40 *     while (scriptRun.next()) {
41 *         int start  = scriptRun.getScriptStart();
42 *         int limit  = scriptRun.getScriptLimit();
43 *         int script = scriptRun.getScriptCode();
44 *
45 *         System.out.println("Script \"" + UScript.getName(script) + "\" from " +
46 *                            start + " to " + limit + ".");
47 *     }
48 *  }
49 * </pre>
50 *
51 * @internal
52 * @deprecated This API is ICU internal only.
53 */
54@Deprecated
55public final class UScriptRun
56{
57    /**
58     * Construct an empty <code>UScriptRun</code> object. The <code>next()</code>
59     * method will return <code>false</code> the first time it is called.
60     *
61     * @internal
62     * @deprecated This API is ICU internal only.
63     */
64    @Deprecated
65    public UScriptRun()
66    {
67        char[] nullChars = null;
68
69        reset(nullChars, 0, 0);
70    }
71
72    /**
73     * Construct a <code>UScriptRun</code> object which iterates over the
74     * characters in the given string.
75     *
76     * @param text the string of characters over which to iterate.
77     *
78     * @internal
79     * @deprecated This API is ICU internal only.
80     */
81    @Deprecated
82    public UScriptRun(String text)
83    {
84        reset (text);
85    }
86
87    /**
88     * Construct a <code>UScriptRun</code> object which iterates over a subrange
89     * of the characetrs in the given string.
90     *
91     * @param text the string of characters over which to iterate.
92     * @param start the index of the first character over which to iterate
93     * @param count the number of characters over which to iterate
94     *
95     * @internal
96     * @deprecated This API is ICU internal only.
97     */
98    @Deprecated
99    public UScriptRun(String text, int start, int count)
100    {
101        reset(text, start, count);
102    }
103
104    /**
105     * Construct a <code>UScriptRun</code> object which iterates over the given
106     * characetrs.
107     *
108     * @param chars the array of characters over which to iterate.
109     *
110     * @internal
111     * @deprecated This API is ICU internal only.
112     */
113    @Deprecated
114    public UScriptRun(char[] chars)
115    {
116        reset(chars);
117    }
118
119    /**
120     * Construct a <code>UScriptRun</code> object which iterates over a subrange
121     * of the given characetrs.
122     *
123     * @param chars the array of characters over which to iterate.
124     * @param start the index of the first character over which to iterate
125     * @param count the number of characters over which to iterate
126     *
127     * @internal
128     * @deprecated This API is ICU internal only.
129     */
130    @Deprecated
131    public UScriptRun(char[] chars, int start, int count)
132    {
133        reset(chars, start, count);
134    }
135
136
137    /**
138     * Reset the iterator to the start of the text.
139     *
140     * @internal
141     * @deprecated This API is ICU internal only.
142     */
143    @Deprecated
144    public final void reset()
145    {
146        // empty any old parenStack contents.
147        // NOTE: this is not the most efficient way
148        // to do this, but it's the easiest to write...
149        while (stackIsNotEmpty()) {
150            pop();
151        }
152
153        scriptStart = textStart;
154        scriptLimit = textStart;
155        scriptCode  = UScript.INVALID_CODE;
156        parenSP     = -1;
157        pushCount   =  0;
158        fixupCount  =  0;
159
160        textIndex = textStart;
161    }
162
163    /**
164     * Reset the iterator to iterate over the given range of the text. Throws
165     * IllegalArgumentException if the range is outside of the bounds of the
166     * character array.
167     *
168     * @param start the index of the new first character over which to iterate
169     * @param count the new number of characters over which to iterate.
170     * @exception IllegalArgumentException If invalid arguments are passed.
171     *
172     * @internal
173     * @deprecated This API is ICU internal only.
174     */
175    @Deprecated
176    public final void reset(int start, int count)
177    throws IllegalArgumentException
178    {
179        int len = 0;
180
181        if (text != null) {
182            len = text.length;
183        }
184
185        if (start < 0 || count < 0 || start > len - count) {
186            throw new IllegalArgumentException();
187        }
188
189        textStart = start;
190        textLimit = start + count;
191
192        reset();
193    }
194
195    /**
196     * Reset the iterator to iterate over <code>count</code> characters
197     * in <code>chars</code> starting at <code>start</code>. This allows
198     * clients to reuse an iterator.
199     *
200     * @param chars the new array of characters over which to iterate.
201     * @param start the index of the first character over which to iterate.
202     * @param count the number of characters over which to iterate.
203     *
204     * @internal
205     * @deprecated This API is ICU internal only.
206     */
207    @Deprecated
208    public final void reset(char[] chars, int start, int count)
209    {
210        if (chars == null) {
211            chars = emptyCharArray;
212        }
213
214        text = chars;
215
216        reset(start, count);
217    }
218
219    /**
220     * Reset the iterator to iterate over the characters
221     * in <code>chars</code>. This allows clients to reuse an iterator.
222     *
223     * @param chars the new array of characters over which to iterate.
224     *
225     * @internal
226     * @deprecated This API is ICU internal only.
227     */
228    @Deprecated
229    public final void reset(char[] chars)
230    {
231        int length = 0;
232
233        if (chars != null) {
234            length = chars.length;
235        }
236
237        reset(chars, 0, length);
238    }
239
240    /**
241     * Reset the iterator to iterate over <code>count</code> characters
242     * in <code>text</code> starting at <code>start</code>. This allows
243     * clients to reuse an iterator.
244     *
245     * @param str the new string of characters over which to iterate.
246     * @param start the index of the first character over which to iterate.
247     * @param count the nuber of characters over which to iterate.
248     *
249     * @internal
250     * @deprecated This API is ICU internal only.
251     */
252    @Deprecated
253    public final void reset(String str, int start, int count)
254    {
255        char[] chars = null;
256
257        if (str != null) {
258            chars = str.toCharArray();
259        }
260
261        reset(chars, start, count);
262    }
263
264    /**
265     * Reset the iterator to iterate over the characters
266     * in <code>text</code>. This allows clients to reuse an iterator.
267     *
268     * @param str the new string of characters over which to iterate.
269     *
270     * @internal
271     * @deprecated This API is ICU internal only.
272     */
273    @Deprecated
274    public final void reset(String str)
275    {
276        int length   = 0;
277
278        if (str != null) {
279            length = str.length();
280        }
281
282        reset(str, 0, length);
283    }
284
285
286
287    /**
288     * Get the starting index of the current script run.
289     *
290     * @return the index of the first character in the current script run.
291     *
292     * @internal
293     * @deprecated This API is ICU internal only.
294     */
295    @Deprecated
296    public final int getScriptStart()
297    {
298        return scriptStart;
299    }
300
301    /**
302     * Get the index of the first character after the current script run.
303     *
304     * @return the index of the first character after the current script run.
305     *
306     * @internal
307     * @deprecated This API is ICU internal only.
308     */
309    @Deprecated
310    public final int getScriptLimit()
311    {
312        return scriptLimit;
313    }
314
315    /**
316     * Get the script code for the script of the current script run.
317     *
318     * @return the script code for the script of the current script run.
319     * @see com.ibm.icu.lang.UScript
320     *
321     * @internal
322     * @deprecated This API is ICU internal only.
323     */
324    @Deprecated
325    public final int getScriptCode()
326    {
327        return scriptCode;
328    }
329
330    /**
331     * Find the next script run. Returns <code>false</code> if there
332     * isn't another run, returns <code>true</code> if there is.
333     *
334     * @return <code>false</code> if there isn't another run, <code>true</code> if there is.
335     *
336     * @internal
337     * @deprecated This API is ICU internal only.
338     */
339    @Deprecated
340    public final boolean next()
341    {
342        // if we've fallen off the end of the text, we're done
343        if (scriptLimit >= textLimit) {
344            return false;
345        }
346
347        scriptCode  = UScript.COMMON;
348        scriptStart = scriptLimit;
349
350        syncFixup();
351
352        while (textIndex < textLimit) {
353            int ch = UTF16.charAt(text, textStart, textLimit, textIndex - textStart);
354            int codePointCount = UTF16.getCharCount(ch);
355            int sc = UScript.getScript(ch);
356            int pairIndex = getPairIndex(ch);
357
358            textIndex += codePointCount;
359
360            // Paired character handling:
361            //
362            // if it's an open character, push it onto the stack.
363            // if it's a close character, find the matching open on the
364            // stack, and use that script code. Any non-matching open
365            // characters above it on the stack will be poped.
366            if (pairIndex >= 0) {
367                if ((pairIndex & 1) == 0) {
368                    push(pairIndex, scriptCode);
369                } else {
370                    int pi = pairIndex & ~1;
371
372                    while (stackIsNotEmpty() && top().pairIndex != pi) {
373                        pop();
374                    }
375
376                    if (stackIsNotEmpty()) {
377                        sc = top().scriptCode;
378                    }
379                }
380            }
381
382            if (sameScript(scriptCode, sc)) {
383                if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
384                    scriptCode = sc;
385
386                    fixup(scriptCode);
387                }
388
389                // if this character is a close paired character,
390                // pop the matching open character from the stack
391                if (pairIndex >= 0 && (pairIndex & 1) != 0) {
392                    pop();
393                }
394            } else {
395                // We've just seen the first character of
396                // the next run. Back over it so we'll see
397                // it again the next time.
398                textIndex -= codePointCount;
399                break;
400            }
401        }
402
403        scriptLimit = textIndex;
404        return true;
405    }
406
407    /**
408     * Compare two script codes to see if they are in the same script. If one script is
409     * a strong script, and the other is INHERITED or COMMON, it will compare equal.
410     *
411     * @param scriptOne one of the script codes.
412     * @param scriptTwo the other script code.
413     * @return <code>true</code> if the two scripts are the same.
414     * @see com.ibm.icu.lang.UScript
415     */
416    private static boolean sameScript(int scriptOne, int scriptTwo)
417    {
418        return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo;
419    }
420
421    /*
422     * An internal class which holds entries on the paren stack.
423     */
424    private static final class ParenStackEntry
425    {
426        int pairIndex;
427        int scriptCode;
428
429        public ParenStackEntry(int thePairIndex, int theScriptCode)
430        {
431            pairIndex  = thePairIndex;
432            scriptCode = theScriptCode;
433        }
434    }
435
436    private static final int mod(int sp)
437    {
438        return sp % PAREN_STACK_DEPTH;
439    }
440
441    private static final int inc(int sp, int count)
442    {
443        return mod(sp + count);
444    }
445
446    private static final int inc(int sp)
447    {
448        return inc(sp, 1);
449    }
450
451    private static final int dec(int sp, int count)
452    {
453        return mod(sp + PAREN_STACK_DEPTH - count);
454    }
455
456    private static final int dec(int sp)
457    {
458        return dec(sp, 1);
459    }
460
461    private static final int limitInc(int count)
462    {
463        if (count < PAREN_STACK_DEPTH) {
464            count += 1;
465        }
466
467        return count;
468    }
469
470    private final boolean stackIsEmpty()
471    {
472        return pushCount <= 0;
473    }
474
475    private final boolean stackIsNotEmpty()
476    {
477        return ! stackIsEmpty();
478    }
479
480    private final void push(int pairIndex, int scrptCode)
481    {
482        pushCount  = limitInc(pushCount);
483        fixupCount = limitInc(fixupCount);
484
485        parenSP = inc(parenSP);
486        parenStack[parenSP] = new ParenStackEntry(pairIndex, scrptCode);
487    }
488
489    private final void pop()
490    {
491
492        if (stackIsEmpty()) {
493            return;
494        }
495
496        parenStack[parenSP] = null;
497
498        if (fixupCount > 0) {
499            fixupCount -= 1;
500        }
501
502        pushCount -= 1;
503        parenSP = dec(parenSP);
504
505        // If the stack is now empty, reset the stack
506        // pointers to their initial values.
507        if (stackIsEmpty()) {
508            parenSP = -1;
509        }
510    }
511
512    private final ParenStackEntry top()
513    {
514        return parenStack[parenSP];
515    }
516
517    private final void syncFixup()
518    {
519        fixupCount = 0;
520    }
521
522    private final void fixup(int scrptCode)
523    {
524        int fixupSP = dec(parenSP, fixupCount);
525
526        while (fixupCount-- > 0) {
527            fixupSP = inc(fixupSP);
528            parenStack[fixupSP].scriptCode = scrptCode;
529        }
530    }
531
532    private char[] emptyCharArray = {};
533
534    private char[] text;
535
536    private int textIndex;
537    private int  textStart;
538    private int  textLimit;
539
540    private int  scriptStart;
541    private int  scriptLimit;
542    private int  scriptCode;
543
544    private static int PAREN_STACK_DEPTH = 32;
545    private static ParenStackEntry parenStack[] = new ParenStackEntry[PAREN_STACK_DEPTH];
546    private int parenSP = -1;
547    private int pushCount = 0;
548    private int fixupCount = 0;
549
550    /**
551     * Find the highest bit that's set in a word. Uses a binary search through
552     * the bits.
553     *
554     * @param n the word in which to find the highest bit that's set.
555     * @return the bit number (counting from the low order bit) of the highest bit.
556     */
557    private static final byte highBit(int n)
558    {
559        if (n <= 0) {
560            return -32;
561        }
562
563        byte bit = 0;
564
565        if (n >= 1 << 16) {
566            n >>= 16;
567            bit += 16;
568        }
569
570        if (n >= 1 << 8) {
571            n >>= 8;
572            bit += 8;
573        }
574
575        if (n >= 1 << 4) {
576            n >>= 4;
577            bit += 4;
578        }
579
580        if (n >= 1 << 2) {
581            n >>= 2;
582            bit += 2;
583        }
584
585        if (n >= 1 << 1) {
586            n >>= 1;
587            bit += 1;
588        }
589
590        return bit;
591    }
592
593    /**
594     * Search the pairedChars array for the given character.
595     *
596     * @param ch the character for which to search.
597     * @return the index of the character in the table, or -1 if it's not there.
598     */
599    private static int getPairIndex(int ch)
600    {
601        int probe = pairedCharPower;
602        int index = 0;
603
604        if (ch >= pairedChars[pairedCharExtra]) {
605            index = pairedCharExtra;
606        }
607
608        while (probe > (1 << 0)) {
609            probe >>= 1;
610
611            if (ch >= pairedChars[index + probe]) {
612                index += probe;
613            }
614        }
615
616        if (pairedChars[index] != ch) {
617            index = -1;
618        }
619
620        return index;
621    }
622
623    private static int pairedChars[] = {
624        0x0028, 0x0029, // ascii paired punctuation
625        0x003c, 0x003e,
626        0x005b, 0x005d,
627        0x007b, 0x007d,
628        0x00ab, 0x00bb, // guillemets
629        0x2018, 0x2019, // general punctuation
630        0x201c, 0x201d,
631        0x2039, 0x203a,
632        0x3008, 0x3009, // chinese paired punctuation
633        0x300a, 0x300b,
634        0x300c, 0x300d,
635        0x300e, 0x300f,
636        0x3010, 0x3011,
637        0x3014, 0x3015,
638        0x3016, 0x3017,
639        0x3018, 0x3019,
640        0x301a, 0x301b
641    };
642
643    private static int pairedCharPower = 1 << highBit(pairedChars.length);
644    private static int pairedCharExtra = pairedChars.length - pairedCharPower;
645}
646
647