1/**
2*******************************************************************************
3* Copyright (C) 2002-2004, International Business Machines Corporation and    *
4* others. All Rights Reserved.                                                *
5*******************************************************************************
6*/
7package com.ibm.icu.dev.test;
8
/**
 * Small utility class for handling supplementary code points in UTF-16
 * text. It was written purely for updating the Normalization sample from
 * the unicode.org site. If you want the real thing, use the UTF16 class
 * from ICU4J.
 * <p>
 * None of the methods validate their code point arguments: values outside
 * the Unicode range are processed with the same arithmetic as valid ones.
 * Unpaired surrogates in text are always treated as individual code points.
 * @author Vladimir Weinstein, Markus Scherer
 */
public class UTF16Util {
    /**
     * The minimum value for Supplementary code points
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask selecting the low ten bits of a code point, i.e. the part that
     * is stored in the trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3ff;

    /**
     * Value added to (code point >> 10) to obtain the lead surrogate.
     * Evaluates to 0xD7C0, so that U+10000 maps to lead surrogate 0xD800.
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
            LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    /**
     * Offset to add to a combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
            SUPPLEMENTARY_MIN_VALUE
            - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            - TRAIL_SURROGATE_MIN_VALUE;

    /**
     * Value to subtract from (lead << 10) + trail to obtain the
     * code point; the negation of the private surrogate offset. Retained
     * with its original value for code in this package that may read it.
     */
    static final int suppOffset = -SURROGATE_OFFSET_;

    /**
     * Returns the code point that starts at a given index of a string.
     * An unpaired surrogate is returned as is.
     * @param s String in question
     * @param i index from which we want a code point
     * @return int codepoint at index i
     */
    public static final int nextCodePoint(String s, int i) {
        return codePointStartingAt(s, i);
    }

    /**
     * Returns the code point that ends just before a given index of a
     * string (predecrement). An unpaired surrogate is returned as is.
     * @param s String in question
     * @param i index in string
     * @return int codepoint at index --i
     */
    public static final int prevCodePoint(String s, int i) {
        return codePointEndingBefore(s, i);
    }

    /**
     * Returns the code point that starts at a given index of a
     * StringBuffer. An unpaired surrogate is returned as is.
     * @param s StringBuffer in question
     * @param i index from which we want a code point
     * @return int codepoint at index i
     */
    public static final int nextCodePoint(StringBuffer s, int i) {
        return codePointStartingAt(s, i);
    }

    /**
     * Returns the code point that ends just before a given index of a
     * StringBuffer (predecrement). An unpaired surrogate is returned as is.
     * @param s StringBuffer in question
     * @param i index in string
     * @return int codepoint at index --i
     */
    public static final int prevCodePoint(StringBuffer s, int i) {
        return codePointEndingBefore(s, i);
    }

    /**
     * Returns the length in UTF-16 code units of a given code point.
     * Equivalent to {@link #getCharCount(int)}.
     * @param c code point in question
     * @return int length in UTF-16 code units. Can be 1 or 2
     */
    public static final int codePointLength(int c) {
        return getCharCount(c);
    }

    /**
     * Appends a code point to a StringBuffer, as one code unit for a BMP
     * value and as a surrogate pair otherwise.
     * @param buffer StringBuffer in question
     * @param ch code point to append
     */
    public static final void appendCodePoint(StringBuffer buffer, int ch) {
        if (getCharCount(ch) == 1) {
            buffer.append((char) ch);
        } else {
            buffer.append(leadSurrogateOf(ch));
            buffer.append(trailSurrogateOf(ch));
        }
    }

    /**
     * Inserts a code point into a StringBuffer, as one code unit for a BMP
     * value and as a surrogate pair otherwise.
     * @param buffer StringBuffer in question
     * @param i index at which we want code point to be inserted
     * @param ch code point to be inserted
     */
    public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {
        if (getCharCount(ch) == 1) {
            buffer.insert(i, (char) ch);
        } else {
            // Lead first: an invalid index fails before anything is modified.
            buffer.insert(i, leadSurrogateOf(ch));
            buffer.insert(i + 1, trailSurrogateOf(ch));
        }
    }

    /**
     * Replaces the code point that starts at a given index. Can change the
     * length of the buffer when a BMP code point replaces a supplementary
     * one or vice versa.
     * @param buffer StringBuffer in question
     * @param i index at which we want to change the contents
     * @param ch replacement code point
     * @return int difference in resulting StringBuffer length (-1, 0 or 1)
     */
    public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {
        boolean oldIsPair = getCharCount(nextCodePoint(buffer, i)) == 2;

        if (getCharCount(ch) == 1) {
            buffer.setCharAt(i, (char) ch);
            if (oldIsPair) {
                // BMP replaces supplementary: drop the old trail, buffer shrinks
                buffer.deleteCharAt(i + 1);
                return -1;
            }
            return 0;
        }

        buffer.setCharAt(i, leadSurrogateOf(ch));
        if (oldIsPair) {
            buffer.setCharAt(i + 1, trailSurrogateOf(ch));
            return 0;
        }
        // supplementary replaces BMP: make room for the trail, buffer grows
        buffer.insert(i + 1, trailSurrogateOf(ch));
        return 1;
    }

    /**
     * Counts the UTF-32 code points in a UTF-16 encoded string.
     * @param source String in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(String source) {
        return countCodePoints(source);
    }

    /**
     * Counts the UTF-32 code points in a UTF-16 encoded StringBuffer.
     * @param source StringBuffer in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(StringBuffer source) {
        return countCodePoints(source);
    }

    /**
     * Determines how many chars this char32 requires.
     * No validity check is made on char32; perform one before calling
     * if it is required.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     */
    public static int getCharCount(int char32) {
        return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Extract a single UTF-32 value from a substring.
     * Usable when iterating forwards or backwards (together with
     * {@link #getCharCount(int)}) as well as for random access. No validity
     * check is made on the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned: a lead surrogate is paired with the char
     * that follows it, a trail surrogate with the char that precedes it.
     * If a complete supplementary character is not found within
     * [start, limit) the unmatched surrogate itself is returned.
     * @param source array of UTF-16 chars
     * @param start offset to substring in the source array for analyzing
     * @param limit offset to substring in the source array for analyzing
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the code point that contains the char at
     *         offset16.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char[] source, int start, int limit,
                             int offset16) {
        int index = start + offset16;
        if (index < start || index >= limit) {
            throw new ArrayIndexOutOfBoundsException(index);
        }

        char single = source[index];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (isLeadSurrogate(single)) {
            int next = index + 1;
            if (next < limit) {
                char trail = source[next];
                if (isTrailSurrogate(trail)) {
                    return getRawSupplementary(single, trail);
                }
            }
        } else if (index > start) { // trail surrogate with a predecessor
            char lead = source[index - 1];
            if (isLeadSurrogate(lead)) {
                return getRawSupplementary(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Forms a supplementary code point from the argument characters.<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail) {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /**
     * Shared implementation of both nextCodePoint overloads: reads the code
     * point whose first code unit is at index i.
     */
    private static int codePointStartingAt(CharSequence s, int i) {
        char first = s.charAt(i);
        if (isLeadSurrogate(first) && i + 1 < s.length()) {
            char second = s.charAt(i + 1);
            if (isTrailSurrogate(second)) {
                return getRawSupplementary(first, second);
            }
        }
        return first;
    }

    /**
     * Shared implementation of both prevCodePoint overloads: reads the code
     * point whose last code unit is at index i - 1.
     */
    private static int codePointEndingBefore(CharSequence s, int i) {
        char last = s.charAt(i - 1);
        if (isTrailSurrogate(last) && i - 2 >= 0) {
            char before = s.charAt(i - 2);
            if (isLeadSurrogate(before)) {
                return getRawSupplementary(before, last);
            }
        }
        return last;
    }

    /**
     * Shared implementation of both countCodePoint overloads: every code
     * unit counts as one, except a trail surrogate that directly follows a
     * lead surrogate, which counts as zero.
     */
    private static int countCodePoints(CharSequence source) {
        int count = 0;
        boolean afterLead = false;
        for (int i = 0; i < source.length(); ++i) {
            char unit = source.charAt(i);
            if (afterLead && isTrailSurrogate(unit)) {
                afterLead = false; // completes a pair already counted
            } else {
                afterLead = isLeadSurrogate(unit);
                ++count;
            }
        }
        return count;
    }

    /**
     * Computes the first code unit of the surrogate pair that encodes a
     * supplementary code point.
     */
    private static char leadSurrogateOf(int codePoint) {
        return (char) (LEAD_SURROGATE_OFFSET_
                + (codePoint >> LEAD_SURROGATE_SHIFT_));
    }

    /**
     * Computes the second code unit of the surrogate pair that encodes a
     * supplementary code point.
     */
    private static char trailSurrogateOf(int codePoint) {
        return (char) (TRAIL_SURROGATE_MIN_VALUE
                + (codePoint & TRAIL_SURROGATE_MASK_));
    }

}
378