1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines Corporation and    *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9package com.ibm.icu.impl;
10
11import java.io.IOException;
12import java.util.ArrayList;
13import java.util.Locale;
14import java.util.regex.Pattern;
15
16import com.ibm.icu.lang.UCharacter;
17import com.ibm.icu.text.Replaceable;
18import com.ibm.icu.text.UTF16;
19import com.ibm.icu.text.UnicodeMatcher;
20
21public final class Utility {
22
    /** The ASCII apostrophe (single quote) character, U+0027. */
    private static final char APOSTROPHE = '\'';
    /** The ASCII backslash character, U+005C. */
    private static final char BACKSLASH  = '\\';
    /** The int bit pattern 0x80000000: only the sign bit set (same value as Integer.MIN_VALUE). */
    private static final int MAGIC_UNSIGNED = 0x80000000;
26
27    /**
28     * Convenience utility to compare two Object[]s.
29     * Ought to be in System
30     */
31    public final static boolean arrayEquals(Object[] source, Object target) {
32        if (source == null) return (target == null);
33        if (!(target instanceof Object[])) return false;
34        Object[] targ = (Object[]) target;
35        return (source.length == targ.length
36                && arrayRegionMatches(source, 0, targ, 0, source.length));
37    }
38
39    /**
40     * Convenience utility to compare two int[]s
41     * Ought to be in System
42     */
43    public final static boolean arrayEquals(int[] source, Object target) {
44        if (source == null) return (target == null);
45        if (!(target instanceof int[])) return false;
46        int[] targ = (int[]) target;
47        return (source.length == targ.length
48                && arrayRegionMatches(source, 0, targ, 0, source.length));
49    }
50
51    /**
52     * Convenience utility to compare two double[]s
53     * Ought to be in System
54     */
55    public final static boolean arrayEquals(double[] source, Object target) {
56        if (source == null) return (target == null);
57        if (!(target instanceof double[])) return false;
58        double[] targ = (double[]) target;
59        return (source.length == targ.length
60                && arrayRegionMatches(source, 0, targ, 0, source.length));
61    }
62    public final static boolean arrayEquals(byte[] source, Object target) {
63        if (source == null) return (target == null);
64        if (!(target instanceof byte[])) return false;
65        byte[] targ = (byte[]) target;
66        return (source.length == targ.length
67                && arrayRegionMatches(source, 0, targ, 0, source.length));
68    }
69
70    /**
71     * Convenience utility to compare two Object[]s
72     * Ought to be in System
73     */
74    public final static boolean arrayEquals(Object source, Object target) {
75        if (source == null) return (target == null);
76        // for some reason, the correct arrayEquals is not being called
77        // so do it by hand for now.
78        if (source instanceof Object[])
79            return(arrayEquals((Object[]) source,target));
80        if (source instanceof int[])
81            return(arrayEquals((int[]) source,target));
82        if (source instanceof double[])
83            return(arrayEquals((double[]) source, target));
84        if (source instanceof byte[])
85            return(arrayEquals((byte[]) source,target));
86        return source.equals(target);
87    }
88
89    /**
90     * Convenience utility to compare two Object[]s
91     * Ought to be in System.
92     * @param len the length to compare.
93     * The start indices and start+len must be valid.
94     */
95    public final static boolean arrayRegionMatches(Object[] source, int sourceStart,
96            Object[] target, int targetStart,
97            int len)
98    {
99        int sourceEnd = sourceStart + len;
100        int delta = targetStart - sourceStart;
101        for (int i = sourceStart; i < sourceEnd; i++) {
102            if (!arrayEquals(source[i],target[i + delta]))
103                return false;
104        }
105        return true;
106    }
107
108    /**
109     * Convenience utility to compare two Object[]s
110     * Ought to be in System.
111     * @param len the length to compare.
112     * The start indices and start+len must be valid.
113     */
114    public final static boolean arrayRegionMatches(char[] source, int sourceStart,
115            char[] target, int targetStart,
116            int len)
117    {
118        int sourceEnd = sourceStart + len;
119        int delta = targetStart - sourceStart;
120        for (int i = sourceStart; i < sourceEnd; i++) {
121            if (source[i]!=target[i + delta])
122                return false;
123        }
124        return true;
125    }
126
127    /**
128     * Convenience utility to compare two int[]s.
129     * @param len the length to compare.
130     * The start indices and start+len must be valid.
131     * Ought to be in System
132     */
133    public final static boolean arrayRegionMatches(int[] source, int sourceStart,
134            int[] target, int targetStart,
135            int len)
136    {
137        int sourceEnd = sourceStart + len;
138        int delta = targetStart - sourceStart;
139        for (int i = sourceStart; i < sourceEnd; i++) {
140            if (source[i] != target[i + delta])
141                return false;
142        }
143        return true;
144    }
145
146    /**
147     * Convenience utility to compare two arrays of doubles.
148     * @param len the length to compare.
149     * The start indices and start+len must be valid.
150     * Ought to be in System
151     */
152    public final static boolean arrayRegionMatches(double[] source, int sourceStart,
153            double[] target, int targetStart,
154            int len)
155    {
156        int sourceEnd = sourceStart + len;
157        int delta = targetStart - sourceStart;
158        for (int i = sourceStart; i < sourceEnd; i++) {
159            if (source[i] != target[i + delta])
160                return false;
161        }
162        return true;
163    }
164    public final static boolean arrayRegionMatches(byte[] source, int sourceStart,
165            byte[] target, int targetStart, int len){
166        int sourceEnd = sourceStart + len;
167        int delta = targetStart - sourceStart;
168        for (int i = sourceStart; i < sourceEnd; i++) {
169            if (source[i] != target[i + delta])
170                return false;
171        }
172        return true;
173    }
174
175    /**
176     * Trivial reference equality.
177     * This method should help document that we really want == not equals(),
178     * and to have a single place to suppress warnings from static analysis tools.
179     */
180    public static final boolean sameObjects(Object a, Object b) {
181        return a == b;
182    }
183
184    /**
185     * Convenience utility. Does null checks on objects, then calls equals.
186     */
187    public final static boolean objectEquals(Object a, Object b) {
188        return a == null ?
189                b == null ? true : false :
190                    b == null ? false : a.equals(b);
191    }
192
193    /**
194     * Convenience utility. Does null checks on objects, then calls compare.
195     */
196    public static <T extends Comparable<T>> int checkCompare(T a, T b) {
197        return a == null ?
198                b == null ? 0 : -1 :
199                    b == null ? 1 : a.compareTo(b);
200      }
201
202    /**
203     * Convenience utility. Does null checks on object, then calls hashCode.
204     */
205    public static int checkHash(Object a) {
206        return a == null ? 0 : a.hashCode();
207      }
208
    /**
     * The ESCAPE character is used during run-length encoding.  It signals
     * a run of identical chars.  In the encoded form, ESCAPE followed by
     * ESCAPE stands for a literal ESCAPE value, and ESCAPE followed by a
     * count and a value stands for a run of that value.
     */
    private static final char ESCAPE = '\uA5A5';

    /**
     * The ESCAPE_BYTE character is used during run-length encoding.  It signals
     * a run of identical bytes.  It plays the same role in the byte-array
     * encoding that ESCAPE plays in the char-based encodings.
     */
    static final byte ESCAPE_BYTE = (byte)0xA5;
220
221    /**
222     * Construct a string representing an int array.  Use run-length encoding.
223     * A character represents itself, unless it is the ESCAPE character.  Then
224     * the following notations are possible:
225     *   ESCAPE ESCAPE   ESCAPE literal
226     *   ESCAPE n c      n instances of character c
227     * Since an encoded run occupies 3 characters, we only encode runs of 4 or
228     * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
229     * If we encounter a run where n == ESCAPE, we represent this as:
230     *   c ESCAPE n-1 c
231     * The ESCAPE value is chosen so as not to collide with commonly
232     * seen values.
233     */
234    static public final String arrayToRLEString(int[] a) {
235        StringBuilder buffer = new StringBuilder();
236
237        appendInt(buffer, a.length);
238        int runValue = a[0];
239        int runLength = 1;
240        for (int i=1; i<a.length; ++i) {
241            int s = a[i];
242            if (s == runValue && runLength < 0xFFFF) {
243                ++runLength;
244            } else {
245                encodeRun(buffer, runValue, runLength);
246                runValue = s;
247                runLength = 1;
248            }
249        }
250        encodeRun(buffer, runValue, runLength);
251        return buffer.toString();
252    }
253
254    /**
255     * Construct a string representing a short array.  Use run-length encoding.
256     * A character represents itself, unless it is the ESCAPE character.  Then
257     * the following notations are possible:
258     *   ESCAPE ESCAPE   ESCAPE literal
259     *   ESCAPE n c      n instances of character c
260     * Since an encoded run occupies 3 characters, we only encode runs of 4 or
261     * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
262     * If we encounter a run where n == ESCAPE, we represent this as:
263     *   c ESCAPE n-1 c
264     * The ESCAPE value is chosen so as not to collide with commonly
265     * seen values.
266     */
267    static public final String arrayToRLEString(short[] a) {
268        StringBuilder buffer = new StringBuilder();
269        // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]);
270        buffer.append((char) (a.length >> 16));
271        buffer.append((char) a.length);
272        short runValue = a[0];
273        int runLength = 1;
274        for (int i=1; i<a.length; ++i) {
275            short s = a[i];
276            if (s == runValue && runLength < 0xFFFF) ++runLength;
277            else {
278                encodeRun(buffer, runValue, runLength);
279                runValue = s;
280                runLength = 1;
281            }
282        }
283        encodeRun(buffer, runValue, runLength);
284        return buffer.toString();
285    }
286
287    /**
288     * Construct a string representing a char array.  Use run-length encoding.
289     * A character represents itself, unless it is the ESCAPE character.  Then
290     * the following notations are possible:
291     *   ESCAPE ESCAPE   ESCAPE literal
292     *   ESCAPE n c      n instances of character c
293     * Since an encoded run occupies 3 characters, we only encode runs of 4 or
294     * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
295     * If we encounter a run where n == ESCAPE, we represent this as:
296     *   c ESCAPE n-1 c
297     * The ESCAPE value is chosen so as not to collide with commonly
298     * seen values.
299     */
300    static public final String arrayToRLEString(char[] a) {
301        StringBuilder buffer = new StringBuilder();
302        buffer.append((char) (a.length >> 16));
303        buffer.append((char) a.length);
304        char runValue = a[0];
305        int runLength = 1;
306        for (int i=1; i<a.length; ++i) {
307            char s = a[i];
308            if (s == runValue && runLength < 0xFFFF) ++runLength;
309            else {
310                encodeRun(buffer, (short)runValue, runLength);
311                runValue = s;
312                runLength = 1;
313            }
314        }
315        encodeRun(buffer, (short)runValue, runLength);
316        return buffer.toString();
317    }
318
319    /**
320     * Construct a string representing a byte array.  Use run-length encoding.
321     * Two bytes are packed into a single char, with a single extra zero byte at
322     * the end if needed.  A byte represents itself, unless it is the
323     * ESCAPE_BYTE.  Then the following notations are possible:
324     *   ESCAPE_BYTE ESCAPE_BYTE   ESCAPE_BYTE literal
325     *   ESCAPE_BYTE n b           n instances of byte b
326     * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
327     * more bytes.  Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
328     * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
329     *   b ESCAPE_BYTE n-1 b
330     * The ESCAPE_BYTE value is chosen so as not to collide with commonly
331     * seen values.
332     */
333    static public final String arrayToRLEString(byte[] a) {
334        StringBuilder buffer = new StringBuilder();
335        buffer.append((char) (a.length >> 16));
336        buffer.append((char) a.length);
337        byte runValue = a[0];
338        int runLength = 1;
339        byte[] state = new byte[2];
340        for (int i=1; i<a.length; ++i) {
341            byte b = a[i];
342            if (b == runValue && runLength < 0xFF) ++runLength;
343            else {
344                encodeRun(buffer, runValue, runLength, state);
345                runValue = b;
346                runLength = 1;
347            }
348        }
349        encodeRun(buffer, runValue, runLength, state);
350
351        // We must save the final byte, if there is one, by padding
352        // an extra zero.
353        if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state);
354
355        return buffer.toString();
356    }
357
358    /**
359     * Encode a run, possibly a degenerate run (of < 4 values).
360     * @param length The length of the run; must be > 0 && <= 0xFFFF.
361     */
362    private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) {
363        if (length < 4) {
364            for (int j=0; j<length; ++j) {
365                if (value == ESCAPE) {
366                    appendInt(buffer, value);
367                }
368                appendInt(buffer, value);
369            }
370        }
371        else {
372            if (length == ESCAPE) {
373                if (value == ESCAPE) {
374                    appendInt(buffer, ESCAPE);
375                }
376                appendInt(buffer, value);
377                --length;
378            }
379            appendInt(buffer, ESCAPE);
380            appendInt(buffer, length);
381            appendInt(buffer, value); // Don't need to escape this value
382        }
383    }
384
385    private static final <T extends Appendable> void appendInt(T buffer, int value) {
386        try {
387            buffer.append((char)(value >>> 16));
388            buffer.append((char)(value & 0xFFFF));
389        } catch (IOException e) {
390            throw new IllegalIcuArgumentException(e);
391        }
392    }
393
394    /**
395     * Encode a run, possibly a degenerate run (of < 4 values).
396     * @param length The length of the run; must be > 0 && <= 0xFFFF.
397     */
398    private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) {
399        try {
400            char valueChar = (char) value;
401            if (length < 4) {
402                for (int j=0; j<length; ++j) {
403                    if (valueChar == ESCAPE) {
404                        buffer.append(ESCAPE);
405                    }
406                    buffer.append(valueChar);
407                }
408            }
409            else {
410                if (length == ESCAPE) {
411                    if (valueChar == ESCAPE) {
412                        buffer.append(ESCAPE);
413                    }
414                    buffer.append(valueChar);
415                    --length;
416                }
417                buffer.append(ESCAPE);
418                buffer.append((char) length);
419                buffer.append(valueChar); // Don't need to escape this value
420            }
421        } catch (IOException e) {
422            throw new IllegalIcuArgumentException(e);
423        }
424    }
425
426    /**
427     * Encode a run, possibly a degenerate run (of < 4 values).
428     * @param length The length of the run; must be > 0 && <= 0xFF.
429     */
430    private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length,
431            byte[] state) {
432        if (length < 4) {
433            for (int j=0; j<length; ++j) {
434                if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
435                appendEncodedByte(buffer, value, state);
436            }
437        }
438        else {
439            if ((byte)length == ESCAPE_BYTE) {
440                if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
441                appendEncodedByte(buffer, value, state);
442                --length;
443            }
444            appendEncodedByte(buffer, ESCAPE_BYTE, state);
445            appendEncodedByte(buffer, (byte)length, state);
446            appendEncodedByte(buffer, value, state); // Don't need to escape this value
447        }
448    }
449
450    /**
451     * Append a byte to the given Appendable, packing two bytes into each
452     * character.  The state parameter maintains intermediary data between
453     * calls.
454     * @param state A two-element array, with state[0] == 0 if this is the
455     * first byte of a pair, or state[0] != 0 if this is the second byte
456     * of a pair, in which case state[1] is the first byte.
457     */
458    private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value,
459            byte[] state) {
460        try {
461            if (state[0] != 0) {
462                char c = (char) ((state[1] << 8) | ((value) & 0xFF));
463                buffer.append(c);
464                state[0] = 0;
465            }
466            else {
467                state[0] = 1;
468                state[1] = value;
469            }
470        } catch (IOException e) {
471            throw new IllegalIcuArgumentException(e);
472        }
473    }
474
475    /**
476     * Construct an array of ints from a run-length encoded string.
477     */
478    static public final int[] RLEStringToIntArray(String s) {
479        int length = getInt(s, 0);
480        int[] array = new int[length];
481        int ai = 0, i = 1;
482
483        int maxI = s.length() / 2;
484        while (ai < length && i < maxI) {
485            int c = getInt(s, i++);
486
487            if (c == ESCAPE) {
488                c = getInt(s, i++);
489                if (c == ESCAPE) {
490                    array[ai++] = c;
491                } else {
492                    int runLength = c;
493                    int runValue = getInt(s, i++);
494                    for (int j=0; j<runLength; ++j) {
495                        array[ai++] = runValue;
496                    }
497                }
498            }
499            else {
500                array[ai++] = c;
501            }
502        }
503
504        if (ai != length || i != maxI) {
505            throw new IllegalStateException("Bad run-length encoded int array");
506        }
507
508        return array;
509    }
510    static final int getInt(String s, int i) {
511        return ((s.charAt(2*i)) << 16) | s.charAt(2*i+1);
512    }
513
514    /**
515     * Construct an array of shorts from a run-length encoded string.
516     */
517    static public final short[] RLEStringToShortArray(String s) {
518        int length = ((s.charAt(0)) << 16) | (s.charAt(1));
519        short[] array = new short[length];
520        int ai = 0;
521        for (int i=2; i<s.length(); ++i) {
522            char c = s.charAt(i);
523            if (c == ESCAPE) {
524                c = s.charAt(++i);
525                if (c == ESCAPE) {
526                    array[ai++] = (short) c;
527                } else {
528                    int runLength = c;
529                    short runValue = (short) s.charAt(++i);
530                    for (int j=0; j<runLength; ++j) array[ai++] = runValue;
531                }
532            }
533            else {
534                array[ai++] = (short) c;
535            }
536        }
537
538        if (ai != length)
539            throw new IllegalStateException("Bad run-length encoded short array");
540
541        return array;
542    }
543
544    /**
545     * Construct an array of shorts from a run-length encoded string.
546     */
547    static public final char[] RLEStringToCharArray(String s) {
548        int length = ((s.charAt(0)) << 16) | (s.charAt(1));
549        char[] array = new char[length];
550        int ai = 0;
551        for (int i=2; i<s.length(); ++i) {
552            char c = s.charAt(i);
553            if (c == ESCAPE) {
554                c = s.charAt(++i);
555                if (c == ESCAPE) {
556                    array[ai++] = c;
557                } else {
558                    int runLength = c;
559                    char runValue = s.charAt(++i);
560                    for (int j=0; j<runLength; ++j) array[ai++] = runValue;
561                }
562            }
563            else {
564                array[ai++] = c;
565            }
566        }
567
568        if (ai != length)
569            throw new IllegalStateException("Bad run-length encoded short array");
570
571        return array;
572    }
573
574    /**
575     * Construct an array of bytes from a run-length encoded string.
576     */
577    static public final byte[] RLEStringToByteArray(String s) {
578        int length = ((s.charAt(0)) << 16) | (s.charAt(1));
579        byte[] array = new byte[length];
580        boolean nextChar = true;
581        char c = 0;
582        int node = 0;
583        int runLength = 0;
584        int i = 2;
585        for (int ai=0; ai<length; ) {
586            // This part of the loop places the next byte into the local
587            // variable 'b' each time through the loop.  It keeps the
588            // current character in 'c' and uses the boolean 'nextChar'
589            // to see if we've taken both bytes out of 'c' yet.
590            byte b;
591            if (nextChar) {
592                c = s.charAt(i++);
593                b = (byte) (c >> 8);
594                nextChar = false;
595            }
596            else {
597                b = (byte) (c & 0xFF);
598                nextChar = true;
599            }
600
601            // This part of the loop is a tiny state machine which handles
602            // the parsing of the run-length encoding.  This would be simpler
603            // if we could look ahead, but we can't, so we use 'node' to
604            // move between three nodes in the state machine.
605            switch (node) {
606            case 0:
607                // Normal idle node
608                if (b == ESCAPE_BYTE) {
609                    node = 1;
610                }
611                else {
612                    array[ai++] = b;
613                }
614                break;
615            case 1:
616                // We have seen one ESCAPE_BYTE; we expect either a second
617                // one, or a run length and value.
618                if (b == ESCAPE_BYTE) {
619                    array[ai++] = ESCAPE_BYTE;
620                    node = 0;
621                }
622                else {
623                    runLength = b;
624                    // Interpret signed byte as unsigned
625                    if (runLength < 0) runLength += 0x100;
626                    node = 2;
627                }
628                break;
629            case 2:
630                // We have seen an ESCAPE_BYTE and length byte.  We interpret
631                // the next byte as the value to be repeated.
632                for (int j=0; j<runLength; ++j) array[ai++] = b;
633                node = 0;
634                break;
635            }
636        }
637
638        if (node != 0)
639            throw new IllegalStateException("Bad run-length encoded byte array");
640
641        if (i != s.length())
642            throw new IllegalStateException("Excess data in RLE byte array string");
643
644        return array;
645    }
646
    /**
     * The value of the "line.separator" system property, read once when this
     * class is initialized.  formatForSource writes it between generated lines.
     * Note that the field is public and not final.
     */
    static public String LINE_SEPARATOR = System.getProperty("line.separator");
648
649    /**
650     * Format a String for representation in a source file.  This includes
651     * breaking it into lines and escaping characters using octal notation
652     * when necessary (control characters and double quotes).
653     */
654    static public final String formatForSource(String s) {
655        StringBuilder buffer = new StringBuilder();
656        for (int i=0; i<s.length();) {
657            if (i > 0) buffer.append('+').append(LINE_SEPARATOR);
658            buffer.append("        \"");
659            int count = 11;
660            while (i<s.length() && count<80) {
661                char c = s.charAt(i++);
662                if (c < '\u0020' || c == '"' || c == '\\') {
663                    if (c == '\n') {
664                        buffer.append("\\n");
665                        count += 2;
666                    } else if (c == '\t') {
667                        buffer.append("\\t");
668                        count += 2;
669                    } else if (c == '\r') {
670                        buffer.append("\\r");
671                        count += 2;
672                    } else {
673                        // Represent control characters, backslash and double quote
674                        // using octal notation; otherwise the string we form
675                        // won't compile, since Unicode escape sequences are
676                        // processed before tokenization.
677                        buffer.append('\\');
678                        buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
679                        buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
680                        buffer.append(HEX_DIGIT[(c & 0007)]);
681                        count += 4;
682                    }
683                }
684                else if (c <= '\u007E') {
685                    buffer.append(c);
686                    count += 1;
687                }
688                else {
689                    buffer.append("\\u");
690                    buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
691                    buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
692                    buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
693                    buffer.append(HEX_DIGIT[(c & 0x000F)]);
694                    count += 6;
695                }
696            }
697            buffer.append('"');
698        }
699        return buffer.toString();
700    }
701
    /**
     * Upper-case hexadecimal digit characters, indexed by digit value 0 to 15.
     * The first eight entries are also the octal digits, which
     * formatForSource and format1ForSource use for octal escapes.
     */
    static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7',
        '8','9','A','B','C','D','E','F'};
704
705    /**
706     * Format a String for representation in a source file.  Like
707     * formatForSource but does not do line breaking.
708     */
709    static public final String format1ForSource(String s) {
710        StringBuilder buffer = new StringBuilder();
711        buffer.append("\"");
712        for (int i=0; i<s.length();) {
713            char c = s.charAt(i++);
714            if (c < '\u0020' || c == '"' || c == '\\') {
715                if (c == '\n') {
716                    buffer.append("\\n");
717                } else if (c == '\t') {
718                    buffer.append("\\t");
719                } else if (c == '\r') {
720                    buffer.append("\\r");
721                } else {
722                    // Represent control characters, backslash and double quote
723                    // using octal notation; otherwise the string we form
724                    // won't compile, since Unicode escape sequences are
725                    // processed before tokenization.
726                    buffer.append('\\');
727                    buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
728                    buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
729                    buffer.append(HEX_DIGIT[(c & 0007)]);
730                }
731            }
732            else if (c <= '\u007E') {
733                buffer.append(c);
734            }
735            else {
736                buffer.append("\\u");
737                buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
738                buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
739                buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
740                buffer.append(HEX_DIGIT[(c & 0x000F)]);
741            }
742        }
743        buffer.append('"');
744        return buffer.toString();
745    }
746
747    /**
748     * Convert characters outside the range U+0020 to U+007F to
749     * Unicode escapes, and convert backslash to a double backslash.
750     */
751    public static final String escape(String s) {
752        StringBuilder buf = new StringBuilder();
753        for (int i=0; i<s.length(); ) {
754            int c = Character.codePointAt(s, i);
755            i += UTF16.getCharCount(c);
756            if (c >= ' ' && c <= 0x007F) {
757                if (c == '\\') {
758                    buf.append("\\\\"); // That is, "\\"
759                } else {
760                    buf.append((char)c);
761                }
762            } else {
763                boolean four = c <= 0xFFFF;
764                buf.append(four ? "\\u" : "\\U");
765                buf.append(hex(c, four ? 4 : 8));
766            }
767        }
768        return buf.toString();
769    }
770
    /*
     * Table of single-letter escapes, stored as flat pairs: the character
     * code of the escape letter, then the character value it stands for
     * (for example 'n' (0x6E) is paired with 0x0A).
     * This map must be in ASCENDING ORDER OF THE ESCAPE CODE.
     * The first four entries below are commented out and therefore not part
     * of the table; each of them would map a character to itself.
     * NOTE(review): the code that searches this table is not visible here;
     * presumably the ordering lets the lookup stop early -- confirm.
     */
    static private final char[] UNESCAPE_MAP = {
        /*"   0x22, 0x22 */
        /*'   0x27, 0x27 */
        /*?   0x3F, 0x3F */
        /*\   0x5C, 0x5C */
        /*a*/ 0x61, 0x07,
        /*b*/ 0x62, 0x08,
        /*e*/ 0x65, 0x1b,
        /*f*/ 0x66, 0x0c,
        /*n*/ 0x6E, 0x0a,
        /*r*/ 0x72, 0x0d,
        /*t*/ 0x74, 0x09,
        /*v*/ 0x76, 0x0b
    };
786
787    /**
788     * Convert an escape to a 32-bit code point value.  We attempt
789     * to parallel the icu4c unescapeAt() function.
790     * @param offset16 an array containing offset to the character
791     * <em>after</em> the backslash.  Upon return offset16[0] will
792     * be updated to point after the escape sequence.
793     * @return character value from 0 to 10FFFF, or -1 on error.
794     */
795    public static int unescapeAt(String s, int[] offset16) {
796        int c;
797        int result = 0;
798        int n = 0;
799        int minDig = 0;
800        int maxDig = 0;
801        int bitsPerDigit = 4;
802        int dig;
803        int i;
804        boolean braces = false;
805
806        /* Check that offset is in range */
807        int offset = offset16[0];
808        int length = s.length();
809        if (offset < 0 || offset >= length) {
810            return -1;
811        }
812
813        /* Fetch first UChar after '\\' */
814        c = Character.codePointAt(s, offset);
815        offset += UTF16.getCharCount(c);
816
817        /* Convert hexadecimal and octal escapes */
818        switch (c) {
819        case 'u':
820            minDig = maxDig = 4;
821            break;
822        case 'U':
823            minDig = maxDig = 8;
824            break;
825        case 'x':
826            minDig = 1;
827            if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
828                ++offset;
829                braces = true;
830                maxDig = 8;
831            } else {
832                maxDig = 2;
833            }
834            break;
835        default:
836            dig = UCharacter.digit(c, 8);
837            if (dig >= 0) {
838                minDig = 1;
839                maxDig = 3;
840                n = 1; /* Already have first octal digit */
841                bitsPerDigit = 3;
842                result = dig;
843            }
844            break;
845        }
846        if (minDig != 0) {
847            while (offset < length && n < maxDig) {
848                c = UTF16.charAt(s, offset);
849                dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
850                if (dig < 0) {
851                    break;
852                }
853                result = (result << bitsPerDigit) | dig;
854                offset += UTF16.getCharCount(c);
855                ++n;
856            }
857            if (n < minDig) {
858                return -1;
859            }
860            if (braces) {
861                if (c != 0x7D /*}*/) {
862                    return -1;
863                }
864                ++offset;
865            }
866            if (result < 0 || result >= 0x110000) {
867                return -1;
868            }
869            // If an escape sequence specifies a lead surrogate, see
870            // if there is a trail surrogate after it, either as an
871            // escape or as a literal.  If so, join them up into a
872            // supplementary.
873            if (offset < length &&
874                    UTF16.isLeadSurrogate((char) result)) {
875                int ahead = offset+1;
876                c = s.charAt(offset); // [sic] get 16-bit code unit
877                if (c == '\\' && ahead < length) {
878                    int o[] = new int[] { ahead };
879                    c = unescapeAt(s, o);
880                    ahead = o[0];
881                }
882                if (UTF16.isTrailSurrogate((char) c)) {
883                    offset = ahead;
884                    result = Character.toCodePoint((char) result, (char) c);
885                }
886            }
887            offset16[0] = offset;
888            return result;
889        }
890
891        /* Convert C-style escapes in table */
892        for (i=0; i<UNESCAPE_MAP.length; i+=2) {
893            if (c == UNESCAPE_MAP[i]) {
894                offset16[0] = offset;
895                return UNESCAPE_MAP[i+1];
896            } else if (c < UNESCAPE_MAP[i]) {
897                break;
898            }
899        }
900
901        /* Map \cX to control-X: X & 0x1F */
902        if (c == 'c' && offset < length) {
903            c = UTF16.charAt(s, offset);
904            offset16[0] = offset + UTF16.getCharCount(c);
905            return 0x1F & c;
906        }
907
908        /* If no special forms are recognized, then consider
909         * the backslash to generically escape the next character. */
910        offset16[0] = offset;
911        return c;
912    }
913
914    /**
915     * Convert all escapes in a given string using unescapeAt().
916     * @exception IllegalArgumentException if an invalid escape is
917     * seen.
918     */
919    public static String unescape(String s) {
920        StringBuilder buf = new StringBuilder();
921        int[] pos = new int[1];
922        for (int i=0; i<s.length(); ) {
923            char c = s.charAt(i++);
924            if (c == '\\') {
925                pos[0] = i;
926                int e = unescapeAt(s, pos);
927                if (e < 0) {
928                    throw new IllegalArgumentException("Invalid escape sequence " +
929                            s.substring(i-1, Math.min(i+8, s.length())));
930                }
931                buf.appendCodePoint(e);
932                i = pos[0];
933            } else {
934                buf.append(c);
935            }
936        }
937        return buf.toString();
938    }
939
940    /**
941     * Convert all escapes in a given string using unescapeAt().
942     * Leave invalid escape sequences unchanged.
943     */
944    public static String unescapeLeniently(String s) {
945        StringBuilder buf = new StringBuilder();
946        int[] pos = new int[1];
947        for (int i=0; i<s.length(); ) {
948            char c = s.charAt(i++);
949            if (c == '\\') {
950                pos[0] = i;
951                int e = unescapeAt(s, pos);
952                if (e < 0) {
953                    buf.append(c);
954                } else {
955                    buf.appendCodePoint(e);
956                    i = pos[0];
957                }
958            } else {
959                buf.append(c);
960            }
961        }
962        return buf.toString();
963    }
964
965    /**
966     * Convert a char to 4 hex uppercase digits.  E.g., hex('a') =>
967     * "0041".
968     */
969    public static String hex(long ch) {
970        return hex(ch, 4);
971    }
972
973    /**
974     * Supplies a zero-padded hex representation of an integer (without 0x)
975     */
976    static public String hex(long i, int places) {
977        if (i == Long.MIN_VALUE) return "-8000000000000000";
978        boolean negative = i < 0;
979        if (negative) {
980            i = -i;
981        }
982        String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
983        if (result.length() < places) {
984            result = "0000000000000000".substring(result.length(),places) + result;
985        }
986        if (negative) {
987            return '-' + result;
988        }
989        return result;
990    }
991
992    /**
993     * Convert a string to comma-separated groups of 4 hex uppercase
994     * digits.  E.g., hex('ab') => "0041,0042".
995     */
996    public static String hex(CharSequence s) {
997        return hex(s, 4, ",", true, new StringBuilder()).toString();
998    }
999
1000    /**
1001     * Convert a string to separated groups of hex uppercase
1002     * digits.  E.g., hex('ab'...) => "0041,0042".  Append the output
1003     * to the given Appendable.
1004     */
1005    public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
1006        try {
1007            if (useCodePoints) {
1008                int cp;
1009                for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1010                    cp = Character.codePointAt(s, i);
1011                    if (i != 0) {
1012                        result.append(separator);
1013                    }
1014                    result.append(hex(cp,width));
1015                }
1016            } else {
1017                for (int i = 0; i < s.length(); ++i) {
1018                    if (i != 0) {
1019                        result.append(separator);
1020                    }
1021                    result.append(hex(s.charAt(i),width));
1022                }
1023            }
1024            return result;
1025        } catch (IOException e) {
1026            throw new IllegalIcuArgumentException(e);
1027        }
1028    }
1029
1030    public static String hex(byte[] o, int start, int end, String separator) {
1031        StringBuilder result = new StringBuilder();
1032        //int ch;
1033        for (int i = start; i < end; ++i) {
1034          if (i != 0) result.append(separator);
1035          result.append(hex(o[i]));
1036        }
1037        return result.toString();
1038      }
1039
1040    /**
1041     * Convert a string to comma-separated groups of 4 hex uppercase
1042     * digits.  E.g., hex('ab') => "0041,0042".
1043     */
1044    public static <S extends CharSequence> String hex(S s, int width, S separator) {
1045        return hex(s, width, separator, true, new StringBuilder()).toString();
1046    }
1047
1048    /**
1049     * Split a string into pieces based on the given divider character
1050     * @param s the string to split
1051     * @param divider the character on which to split.  Occurrences of
1052     * this character are not included in the output
1053     * @param output an array to receive the substrings between
1054     * instances of divider.  It must be large enough on entry to
1055     * accomodate all output.  Adjacent instances of the divider
1056     * character will place empty strings into output.  Before
1057     * returning, output is padded out with empty strings.
1058     */
1059    public static void split(String s, char divider, String[] output) {
1060        int last = 0;
1061        int current = 0;
1062        int i;
1063        for (i = 0; i < s.length(); ++i) {
1064            if (s.charAt(i) == divider) {
1065                output[current++] = s.substring(last,i);
1066                last = i+1;
1067            }
1068        }
1069        output[current++] = s.substring(last,i);
1070        while (current < output.length) {
1071            output[current++] = "";
1072        }
1073    }
1074
1075    /**
1076     * Split a string into pieces based on the given divider character
1077     * @param s the string to split
1078     * @param divider the character on which to split.  Occurrences of
1079     * this character are not included in the output
1080     * @return output an array to receive the substrings between
1081     * instances of divider. Adjacent instances of the divider
1082     * character will place empty strings into output.
1083     */
1084    public static String[] split(String s, char divider) {
1085        int last = 0;
1086        int i;
1087        ArrayList<String> output = new ArrayList<String>();
1088        for (i = 0; i < s.length(); ++i) {
1089            if (s.charAt(i) == divider) {
1090                output.add(s.substring(last,i));
1091                last = i+1;
1092            }
1093        }
1094        output.add( s.substring(last,i));
1095        return output.toArray(new String[output.size()]);
1096    }
1097
1098    /**
1099     * Look up a given string in a string array.  Returns the index at
1100     * which the first occurrence of the string was found in the
1101     * array, or -1 if it was not found.
1102     * @param source the string to search for
1103     * @param target the array of zero or more strings in which to
1104     * look for source
1105     * @return the index of target at which source first occurs, or -1
1106     * if not found
1107     */
1108    public static int lookup(String source, String[] target) {
1109        for (int i = 0; i < target.length; ++i) {
1110            if (source.equals(target[i])) return i;
1111        }
1112        return -1;
1113    }
1114
1115    /**
1116     * Parse a single non-whitespace character 'ch', optionally
1117     * preceded by whitespace.
1118     * @param id the string to be parsed
1119     * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
1120     * offset of the first character to be parsed.  On output, pos[0]
1121     * is the index after the last parsed character.  If the parse
1122     * fails, pos[0] will be unchanged.
1123     * @param ch the non-whitespace character to be parsed.
1124     * @return true if 'ch' is seen preceded by zero or more
1125     * whitespace characters.
1126     */
1127    public static boolean parseChar(String id, int[] pos, char ch) {
1128        int start = pos[0];
1129        pos[0] = PatternProps.skipWhiteSpace(id, pos[0]);
1130        if (pos[0] == id.length() ||
1131                id.charAt(pos[0]) != ch) {
1132            pos[0] = start;
1133            return false;
1134        }
1135        ++pos[0];
1136        return true;
1137    }
1138
1139    /**
1140     * Parse a pattern string starting at offset pos.  Keywords are
1141     * matched case-insensitively.  Spaces may be skipped and may be
1142     * optional or required.  Integer values may be parsed, and if
1143     * they are, they will be returned in the given array.  If
1144     * successful, the offset of the next non-space character is
1145     * returned.  On failure, -1 is returned.
1146     * @param pattern must only contain lowercase characters, which
1147     * will match their uppercase equivalents as well.  A space
1148     * character matches one or more required spaces.  A '~' character
1149     * matches zero or more optional spaces.  A '#' character matches
1150     * an integer and stores it in parsedInts, which the caller must
1151     * ensure has enough capacity.
1152     * @param parsedInts array to receive parsed integers.  Caller
1153     * must ensure that parsedInts.length is >= the number of '#'
1154     * signs in 'pattern'.
1155     * @return the position after the last character parsed, or -1 if
1156     * the parse failed
1157     */
1158    @SuppressWarnings("fallthrough")
1159    public static int parsePattern(String rule, int pos, int limit,
1160            String pattern, int[] parsedInts) {
1161        // TODO Update this to handle surrogates
1162        int[] p = new int[1];
1163        int intCount = 0; // number of integers parsed
1164        for (int i=0; i<pattern.length(); ++i) {
1165            char cpat = pattern.charAt(i);
1166            char c;
1167            switch (cpat) {
1168            case ' ':
1169                if (pos >= limit) {
1170                    return -1;
1171                }
1172                c = rule.charAt(pos++);
1173                if (!PatternProps.isWhiteSpace(c)) {
1174                    return -1;
1175                }
1176                // FALL THROUGH to skipWhitespace
1177            case '~':
1178                pos = PatternProps.skipWhiteSpace(rule, pos);
1179                break;
1180            case '#':
1181                p[0] = pos;
1182                parsedInts[intCount++] = parseInteger(rule, p, limit);
1183                if (p[0] == pos) {
1184                    // Syntax error; failed to parse integer
1185                    return -1;
1186                }
1187                pos = p[0];
1188                break;
1189            default:
1190                if (pos >= limit) {
1191                    return -1;
1192                }
1193                c = (char) UCharacter.toLowerCase(rule.charAt(pos++));
1194                if (c != cpat) {
1195                    return -1;
1196                }
1197                break;
1198            }
1199        }
1200        return pos;
1201    }
1202
    /**
     * Parse a pattern string within the given Replaceable and a parsing
     * pattern.  Characters are matched literally and case-sensitively
     * except for the following special characters:
     *
     * ~  zero or more Pattern_White_Space chars
     *
     * If end of pattern is reached with all matches along the way,
     * the first unparsed index is returned.  Otherwise -1 is returned.
     *
     * Note that the pattern is only advanced while there is text left to
     * look at: if index reaches limit before the pattern has been fully
     * consumed, the result is -1, even when the only pattern characters
     * remaining are '~'.
     * @param pat pattern that controls parsing
     * @param text text to be parsed, starting at index
     * @param index offset to first character to parse
     * @param limit offset after last character to parse
     * @return index after last parsed character, or -1 on parse failure.
     */
    public static int parsePattern(String pat,
            Replaceable text,
            int index,
            int limit) {
        // Offset, in code units, of the current pattern character.
        int ipat = 0;

        // empty pattern matches immediately
        if (ipat == pat.length()) {
            return index;
        }

        // Current pattern code point.
        int cpat = Character.codePointAt(pat, ipat);

        while (index < limit) {
            int c = text.char32At(index);

            // parse \s*
            if (cpat == '~') {
                if (PatternProps.isWhiteSpace(c)) {
                    // Consume the whitespace and stay on the '~'.
                    index += UTF16.getCharCount(c);
                    continue;
                } else {
                    // The run of whitespace is over; step past the '~'
                    // without consuming c.
                    if (++ipat == pat.length()) {
                        return index; // success; c unparsed
                    }
                    // fall thru; process c again with next cpat
                }
            }

            // parse literal
            else if (c == cpat) {
                // Advance text and pattern by the same number of code units.
                int n = UTF16.getCharCount(c);
                index += n;
                ipat += n;
                if (ipat == pat.length()) {
                    return index; // success; c parsed
                }
                // fall thru; get next cpat
            }

            // match failure of literal
            else {
                return -1;
            }

            // Reached only from the two "fall thru" paths above.
            cpat = UTF16.charAt(pat, ipat);
        }

        return -1; // text ended before end of pat
    }
1269
1270    /**
1271     * Parse an integer at pos, either of the form \d+ or of the form
1272     * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
1273     * or octal format.
1274     * @param pos INPUT-OUTPUT parameter.  On input, the first
1275     * character to parse.  On output, the character after the last
1276     * parsed character.
1277     */
1278    public static int parseInteger(String rule, int[] pos, int limit) {
1279        int count = 0;
1280        int value = 0;
1281        int p = pos[0];
1282        int radix = 10;
1283
1284        if (rule.regionMatches(true, p, "0x", 0, 2)) {
1285            p += 2;
1286            radix = 16;
1287        } else if (p < limit && rule.charAt(p) == '0') {
1288            p++;
1289            count = 1;
1290            radix = 8;
1291        }
1292
1293        while (p < limit) {
1294            int d = UCharacter.digit(rule.charAt(p++), radix);
1295            if (d < 0) {
1296                --p;
1297                break;
1298            }
1299            ++count;
1300            int v = (value * radix) + d;
1301            if (v <= value) {
1302                // If there are too many input digits, at some point
1303                // the value will go negative, e.g., if we have seen
1304                // "0x8000000" already and there is another '0', when
1305                // we parse the next 0 the value will go negative.
1306                return 0;
1307            }
1308            value = v;
1309        }
1310        if (count > 0) {
1311            pos[0] = p;
1312        }
1313        return value;
1314    }
1315
1316    /**
1317     * Parse a Unicode identifier from the given string at the given
1318     * position.  Return the identifier, or null if there is no
1319     * identifier.
1320     * @param str the string to parse
1321     * @param pos INPUT-OUPUT parameter.  On INPUT, pos[0] is the
1322     * first character to examine.  It must be less than str.length(),
1323     * and it must not point to a whitespace character.  That is, must
1324     * have pos[0] < str.length().  On
1325     * OUTPUT, the position after the last parsed character.
1326     * @return the Unicode identifier, or null if there is no valid
1327     * identifier at pos[0].
1328     */
1329    public static String parseUnicodeIdentifier(String str, int[] pos) {
1330        // assert(pos[0] < str.length());
1331        StringBuilder buf = new StringBuilder();
1332        int p = pos[0];
1333        while (p < str.length()) {
1334            int ch = Character.codePointAt(str, p);
1335            if (buf.length() == 0) {
1336                if (UCharacter.isUnicodeIdentifierStart(ch)) {
1337                    buf.appendCodePoint(ch);
1338                } else {
1339                    return null;
1340                }
1341            } else {
1342                if (UCharacter.isUnicodeIdentifierPart(ch)) {
1343                    buf.appendCodePoint(ch);
1344                } else {
1345                    break;
1346                }
1347            }
1348            p += UTF16.getCharCount(ch);
1349        }
1350        pos[0] = p;
1351        return buf.toString();
1352    }
1353
    /**
     * Digit characters indexed by digit value: '0'-'9' for the values 0-9
     * and 'A'-'Z' for the values 10-35, which covers every radix from 2
     * to 36.
     */
    static final char DIGITS[] = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z'
    };
1360
1361    /**
1362     * Append the digits of a positive integer to the given
1363     * <code>Appendable</code> in the given radix. This is
1364     * done recursively since it is easiest to generate the low-
1365     * order digit first, but it must be appended last.
1366     *
1367     * @param result is the <code>Appendable</code> to append to
1368     * @param n is the positive integer
1369     * @param radix is the radix, from 2 to 36 inclusive
1370     * @param minDigits is the minimum number of digits to append.
1371     */
1372    private static <T extends Appendable> void recursiveAppendNumber(T result, int n,
1373            int radix, int minDigits)
1374    {
1375        try {
1376            int digit = n % radix;
1377
1378            if (n >= radix || minDigits > 1) {
1379                recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
1380            }
1381            result.append(DIGITS[digit]);
1382        } catch (IOException e) {
1383            throw new IllegalIcuArgumentException(e);
1384        }
1385    }
1386
1387    /**
1388     * Append a number to the given Appendable in the given radix.
1389     * Standard digits '0'-'9' are used and letters 'A'-'Z' for
1390     * radices 11 through 36.
1391     * @param result the digits of the number are appended here
1392     * @param n the number to be converted to digits; may be negative.
1393     * If negative, a '-' is prepended to the digits.
1394     * @param radix a radix from 2 to 36 inclusive.
1395     * @param minDigits the minimum number of digits, not including
1396     * any '-', to produce.  Values less than 2 have no effect.  One
1397     * digit is always emitted regardless of this parameter.
1398     * @return a reference to result
1399     */
1400    public static <T extends Appendable> T appendNumber(T result, int n,
1401            int radix, int minDigits)
1402    {
1403        try {
1404            if (radix < 2 || radix > 36) {
1405                throw new IllegalArgumentException("Illegal radix " + radix);
1406            }
1407
1408
1409            int abs = n;
1410
1411            if (n < 0) {
1412                abs = -n;
1413                result.append("-");
1414            }
1415
1416            recursiveAppendNumber(result, abs, radix, minDigits);
1417
1418            return result;
1419        } catch (IOException e) {
1420            throw new IllegalIcuArgumentException(e);
1421        }
1422
1423    }
1424
1425    /**
1426     * Parse an unsigned 31-bit integer at the given offset.  Use
1427     * UCharacter.digit() to parse individual characters into digits.
1428     * @param text the text to be parsed
1429     * @param pos INPUT-OUTPUT parameter.  On entry, pos[0] is the
1430     * offset within text at which to start parsing; it should point
1431     * to a valid digit.  On exit, pos[0] is the offset after the last
1432     * parsed character.  If the parse failed, it will be unchanged on
1433     * exit.  Must be >= 0 on entry.
1434     * @param radix the radix in which to parse; must be >= 2 and <=
1435     * 36.
1436     * @return a non-negative parsed number, or -1 upon parse failure.
1437     * Parse fails if there are no digits, that is, if pos[0] does not
1438     * point to a valid digit on entry, or if the number to be parsed
1439     * does not fit into a 31-bit unsigned integer.
1440     */
1441    public static int parseNumber(String text, int[] pos, int radix) {
1442        // assert(pos[0] >= 0);
1443        // assert(radix >= 2);
1444        // assert(radix <= 36);
1445        int n = 0;
1446        int p = pos[0];
1447        while (p < text.length()) {
1448            int ch = Character.codePointAt(text, p);
1449            int d = UCharacter.digit(ch, radix);
1450            if (d < 0) {
1451                break;
1452            }
1453            n = radix*n + d;
1454            // ASSUME that when a 32-bit integer overflows it becomes
1455            // negative.  E.g., 214748364 * 10 + 8 => negative value.
1456            if (n < 0) {
1457                return -1;
1458            }
1459            ++p;
1460        }
1461        if (p == pos[0]) {
1462            return -1;
1463        }
1464        pos[0] = p;
1465        return n;
1466    }
1467
1468    /**
1469     * Return true if the character is NOT printable ASCII.  The tab,
1470     * newline and linefeed characters are considered unprintable.
1471     */
1472    public static boolean isUnprintable(int c) {
1473        //0x20 = 32 and 0x7E = 126
1474        return !(c >= 0x20 && c <= 0x7E);
1475    }
1476
1477    /**
1478     * Escape unprintable characters using <backslash>uxxxx notation
1479     * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
1480     * above.  If the character is printable ASCII, then do nothing
1481     * and return FALSE.  Otherwise, append the escaped notation and
1482     * return TRUE.
1483     */
1484    public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
1485        try {
1486            if (isUnprintable(c)) {
1487                result.append('\\');
1488                if ((c & ~0xFFFF) != 0) {
1489                    result.append('U');
1490                    result.append(DIGITS[0xF&(c>>28)]);
1491                    result.append(DIGITS[0xF&(c>>24)]);
1492                    result.append(DIGITS[0xF&(c>>20)]);
1493                    result.append(DIGITS[0xF&(c>>16)]);
1494                } else {
1495                    result.append('u');
1496                }
1497                result.append(DIGITS[0xF&(c>>12)]);
1498                result.append(DIGITS[0xF&(c>>8)]);
1499                result.append(DIGITS[0xF&(c>>4)]);
1500                result.append(DIGITS[0xF&c]);
1501                return true;
1502            }
1503            return false;
1504        } catch (IOException e) {
1505            throw new IllegalIcuArgumentException(e);
1506        }
1507    }
1508
1509    /**
1510     * Returns the index of the first character in a set, ignoring quoted text.
1511     * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
1512     * found by a search for "h".  Unlike String.indexOf(), this method searches
1513     * not for a single character, but for any character of the string
1514     * <code>setOfChars</code>.
1515     * @param text text to be searched
1516     * @param start the beginning index, inclusive; <code>0 <= start
1517     * <= limit</code>.
1518     * @param limit the ending index, exclusive; <code>start <= limit
1519     * <= text.length()</code>.
1520     * @param setOfChars string with one or more distinct characters
1521     * @return Offset of the first character in <code>setOfChars</code>
1522     * found, or -1 if not found.
1523     * @see String#indexOf
1524     */
1525    public static int quotedIndexOf(String text, int start, int limit,
1526            String setOfChars) {
1527        for (int i=start; i<limit; ++i) {
1528            char c = text.charAt(i);
1529            if (c == BACKSLASH) {
1530                ++i;
1531            } else if (c == APOSTROPHE) {
1532                while (++i < limit
1533                        && text.charAt(i) != APOSTROPHE) {}
1534            } else if (setOfChars.indexOf(c) >= 0) {
1535                return i;
1536            }
1537        }
1538        return -1;
1539    }
1540
    /**
     * Append a character to a rule that is being built up.  To flush
     * the quoteBuf to rule, make one final call with isLiteral == true.
     * If there is no final character, pass in (int)-1 as c.
     *
     * <p>Each call takes exactly one of four paths, tested in this order:
     * <ol>
     * <li>c is a literal, or is an unprintable that is to be escaped: any
     * pending quoteBuf content is flushed to rule as a quoted string, then
     * c itself is appended (escaped if required).  This is the only path
     * that checks for the -1 sentinel.</li>
     * <li>no quote is pending and c is an apostrophe or a backslash: it is
     * appended to rule preceded by a backslash.</li>
     * <li>a quote is pending, or c is printable ASCII other than a letter
     * or digit, or c is Pattern_White_Space: c is added to quoteBuf.</li>
     * <li>otherwise c is appended to rule as is.</li>
     * </ol>
     * @param rule the string to append the character to
     * @param c the character to append, or (int)-1 if none.
     * @param isLiteral if true, then the given character should not be
     * quoted or escaped.  Usually this means it is a syntactic element
     * such as > or $
     * @param escapeUnprintable if true, then unprintable characters
     * should be escaped using escapeUnprintable().  These escapes will
     * appear outside of quotes.
     * @param quoteBuf a buffer which is used to build up quoted
     * substrings.  The caller should initially supply an empty buffer,
     * and thereafter should not modify the buffer.  The buffer should be
     * cleared out by, at the end, calling this method with a literal
     * character (which may be -1).
     */
    public static void appendToRule(StringBuffer rule,
            int c,
            boolean isLiteral,
            boolean escapeUnprintable,
            StringBuffer quoteBuf) {
        // If we are escaping unprintables, then escape them outside
        // quotes.  \\u and \\U are not recognized within quotes.  The same
        // logic applies to literals, but literals are never escaped.
        if (isLiteral ||
                (escapeUnprintable && Utility.isUnprintable(c))) {
            if (quoteBuf.length() > 0) {
                // We prefer backslash APOSTROPHE to double APOSTROPHE
                // (more readable, less similar to ") so if there are
                // double APOSTROPHEs at the ends, we pull them outside
                // of the quote.

                // If the first thing in the quoteBuf is APOSTROPHE
                // (doubled) then pull it out.
                while (quoteBuf.length() >= 2 &&
                        quoteBuf.charAt(0) == APOSTROPHE &&
                        quoteBuf.charAt(1) == APOSTROPHE) {
                    rule.append(BACKSLASH).append(APOSTROPHE);
                    quoteBuf.delete(0, 2);
                }
                // If the last thing in the quoteBuf is APOSTROPHE
                // (doubled) then remove and count it and add it after.
                int trailingCount = 0;
                while (quoteBuf.length() >= 2 &&
                        quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
                        quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
                    quoteBuf.setLength(quoteBuf.length()-2);
                    ++trailingCount;
                }
                // Whatever is left goes out as one quoted run; quoteBuf may
                // be empty by now if it held nothing but doubled apostrophes.
                if (quoteBuf.length() > 0) {
                    rule.append(APOSTROPHE);
                    rule.append(quoteBuf);
                    rule.append(APOSTROPHE);
                    quoteBuf.setLength(0);
                }
                // Re-emit the trailing apostrophes that were stripped above.
                while (trailingCount-- > 0) {
                    rule.append(BACKSLASH).append(APOSTROPHE);
                }
            }
            if (c != -1) {
                /* Since spaces are ignored during parsing, they are
                 * emitted only for readability.  We emit one here
                 * only if there isn't already one at the end of the
                 * rule.
                 */
                if (c == ' ') {
                    int len = rule.length();
                    if (len > 0 && rule.charAt(len-1) != ' ') {
                        rule.append(' ');
                    }
                } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
                    // Either escaping is off, or c turned out to be
                    // printable and so was not appended by escapeUnprintable().
                    rule.appendCodePoint(c);
                }
            }
        }

        // Escape ' and '\' and don't begin a quote just for them
        else if (quoteBuf.length() == 0 &&
                (c == APOSTROPHE || c == BACKSLASH)) {
            rule.append(BACKSLASH).append((char)c);
        }

        // Specials (printable ascii that isn't [0-9a-zA-Z]) and
        // whitespace need quoting.  Also append stuff to quotes if we are
        // building up a quoted substring already.
        else if (quoteBuf.length() > 0 ||
                (c >= 0x0021 && c <= 0x007E &&
                        !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
                                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
                                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
                                PatternProps.isWhiteSpace(c)) {
            quoteBuf.appendCodePoint(c);
            // Double ' within a quote
            if (c == APOSTROPHE) {
                quoteBuf.append((char)c);
            }
        }

        // Otherwise just append
        else {
            rule.appendCodePoint(c);
        }
    }
1646
1647    /**
1648     * Append the given string to the rule.  Calls the single-character
1649     * version of appendToRule for each character.
1650     */
1651    public static void appendToRule(StringBuffer rule,
1652            String text,
1653            boolean isLiteral,
1654            boolean escapeUnprintable,
1655            StringBuffer quoteBuf) {
1656        for (int i=0; i<text.length(); ++i) {
1657            // Okay to process in 16-bit code units here
1658            appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
1659        }
1660    }
1661
1662    /**
1663     * Given a matcher reference, which may be null, append its
1664     * pattern as a literal to the given rule.
1665     */
1666    public static void appendToRule(StringBuffer rule,
1667            UnicodeMatcher matcher,
1668            boolean escapeUnprintable,
1669            StringBuffer quoteBuf) {
1670        if (matcher != null) {
1671            appendToRule(rule, matcher.toPattern(escapeUnprintable),
1672                    true, escapeUnprintable, quoteBuf);
1673        }
1674    }
1675
1676    /**
1677     * Compares 2 unsigned integers
1678     * @param source 32 bit unsigned integer
1679     * @param target 32 bit unsigned integer
1680     * @return 0 if equals, 1 if source is greater than target and -1
1681     *         otherwise
1682     */
1683    public static final int compareUnsigned(int source, int target)
1684    {
1685        source += MAGIC_UNSIGNED;
1686        target += MAGIC_UNSIGNED;
1687        if (source < target) {
1688            return -1;
1689        }
1690        else if (source > target) {
1691            return 1;
1692        }
1693        return 0;
1694    }
1695
1696    /**
1697     * Find the highest bit in a positive integer. This is done
1698     * by doing a binary search through the bits.
1699     *
1700     * @param n is the integer
1701     *
1702     * @return the bit number of the highest bit, with 0 being
1703     * the low order bit, or -1 if <code>n</code> is not positive
1704     */
1705    public static final byte highBit(int n)
1706    {
1707        if (n <= 0) {
1708            return -1;
1709        }
1710
1711        byte bit = 0;
1712
1713        if (n >= 1 << 16) {
1714            n >>= 16;
1715        bit += 16;
1716        }
1717
1718        if (n >= 1 << 8) {
1719            n >>= 8;
1720        bit += 8;
1721        }
1722
1723        if (n >= 1 << 4) {
1724            n >>= 4;
1725        bit += 4;
1726        }
1727
1728        if (n >= 1 << 2) {
1729            n >>= 2;
1730        bit += 2;
1731        }
1732
1733        if (n >= 1 << 1) {
1734            n >>= 1;
1735        bit += 1;
1736        }
1737
1738        return bit;
1739    }
1740    /**
1741     * Utility method to take a int[] containing codepoints and return
1742     * a string representation with code units.
1743     */
1744    public static String valueOf(int[]source){
1745        // TODO: Investigate why this method is not on UTF16 class
1746        StringBuilder result = new StringBuilder(source.length);
1747        for(int i=0; i<source.length; i++){
1748            result.appendCodePoint(source[i]);
1749        }
1750        return result.toString();
1751    }
1752
1753
1754    /**
1755     * Utility to duplicate a string count times
1756     * @param s String to be duplicated.
1757     * @param count Number of times to duplicate a string.
1758     */
1759    public static String repeat(String s, int count) {
1760        if (count <= 0) return "";
1761        if (count == 1) return s;
1762        StringBuilder result = new StringBuilder();
1763        for (int i = 0; i < count; ++i) {
1764            result.append(s);
1765        }
1766        return result.toString();
1767    }
1768
1769    public static String[] splitString(String src, String target) {
1770        return src.split("\\Q" + target + "\\E");
1771    }
1772
1773    /**
1774     * Split the string at runs of ascii whitespace characters.
1775     */
1776    public static String[] splitWhitespace(String src) {
1777        return src.split("\\s+");
1778    }
1779
1780    /**
1781     * Parse a list of hex numbers and return a string
1782     * @param string String of hex numbers.
1783     * @param minLength Minimal length.
1784     * @param separator Separator.
1785     * @return A string from hex numbers.
1786     */
1787    public static String fromHex(String string, int minLength, String separator) {
1788        return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+"));
1789    }
1790
1791    /**
1792     * Parse a list of hex numbers and return a string
1793     * @param string String of hex numbers.
1794     * @param minLength Minimal length.
1795     * @param separator Separator.
1796     * @return A string from hex numbers.
1797     */
1798    public static String fromHex(String string, int minLength, Pattern separator) {
1799        StringBuilder buffer = new StringBuilder();
1800        String[] parts = separator.split(string);
1801        for (String part : parts) {
1802            if (part.length() < minLength) {
1803                throw new IllegalArgumentException("code point too short: " + part);
1804            }
1805            int cp = Integer.parseInt(part, 16);
1806            buffer.appendCodePoint(cp);
1807        }
1808        return buffer.toString();
1809    }
1810}
1811