1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/**
4*******************************************************************************
5* Copyright (C) 2002-2004, International Business Machines Corporation and    *
6* others. All Rights Reserved.                                                *
7*******************************************************************************
8*/
9package com.ibm.icu.dev.test;
10
/**
 * Minimal helpers for handling supplementary code points in UTF-16 text.
 * This class was written only to support the updated Normalization sample
 * from the unicode.org site. For anything else, use the UTF16 class from
 * ICU4J.
 * @author Vladimir Weinstein, Markus Scherer
 */
public class UTF16Util {
    /**
     * The minimum value for Supplementary code points
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Offset to add to a combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
            SUPPLEMENTARY_MIN_VALUE
            - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            - TRAIL_SURROGATE_MIN_VALUE;

    /**
     * Value added to (code point >> 10) to obtain the lead surrogate of a
     * supplementary code point (0xD7C0).
     */
    private static final int LEAD_OFFSET_ =
            LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    /**
     * Mask selecting the low ten bits of a code point, which are carried by
     * the trail surrogate.
     */
    private static final int TRAIL_MASK_ = 0x3ff;

    /**
     * Amount to subtract from (lead << 10) + trail to obtain the code point.
     * It is the negation of the private surrogate offset. It is not used
     * inside this class any more, but it is package-visible and is therefore
     * retained.
     */
    static final int suppOffset =
            (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            + TRAIL_SURROGATE_MIN_VALUE
            - SUPPLEMENTARY_MIN_VALUE;

    /**
     * Method nextCodePoint. Reads the code point that starts at an index
     * of a string.
     * @param s String in question
     * @param i index from which we want a code point
     * @return int the supplementary code point if index i holds a lead
     *         surrogate directly followed by a trail surrogate, otherwise
     *         the code unit at index i
     */
    public static final int nextCodePoint(String s, int i) {
        return decodeForward(s, i);
    }

    /**
     * Method prevCodePoint. Reads the code point that ends just before
     * an index of a string.
     * @param s String in question
     * @param i index in string; the code unit examined first is at i - 1
     * @return int the supplementary code point if index i - 1 holds a trail
     *         surrogate directly preceded by a lead surrogate, otherwise the
     *         code unit at index i - 1
     */
    public static final int prevCodePoint(String s, int i) {
        return decodeBackward(s, i);
    }

    /**
     * Method nextCodePoint. Reads the code point that starts at an index
     * of a string buffer.
     * @param s StringBuffer in question
     * @param i index from which we want a code point
     * @return int the supplementary code point if index i holds a lead
     *         surrogate directly followed by a trail surrogate, otherwise
     *         the code unit at index i
     */
    public static final int nextCodePoint(StringBuffer s, int i) {
        return decodeForward(s, i);
    }

    /**
     * Method prevCodePoint. Reads the code point that ends just before
     * an index of a string buffer.
     * @param s StringBuffer in question
     * @param i index in string; the code unit examined first is at i - 1
     * @return int the supplementary code point if index i - 1 holds a trail
     *         surrogate directly preceded by a lead surrogate, otherwise the
     *         code unit at index i - 1
     */
    public static final int prevCodePoint(StringBuffer s, int i) {
        return decodeBackward(s, i);
    }

    /**
     * Method codePointLength. Tells how many UTF-16 code units a code
     * point occupies.
     * @param c code point in question
     * @return int length in UTF-16 code units. Can be 1 or 2
     */
    public static final int codePointLength(int c) {
        return getCharCount(c);
    }

    /**
     * Method appendCodePoint. Adds a code point to the end of a
     * StringBuffer, as one code unit for values up to 0xffff and as a
     * lead/trail pair for larger values.
     * @param buffer StringBuffer in question
     * @param ch code point to append
     */
    public static final void appendCodePoint(StringBuffer buffer, int ch) {
        if (ch > 0xffff) {
            buffer.append(leadUnit(ch)).append(trailUnit(ch));
            return;
        }
        buffer.append((char) ch);
    }

    /**
     * Method insertCodePoint. Puts a code point into a StringBuffer at a
     * given index, as one code unit for values up to 0xffff and as a
     * lead/trail pair for larger values.
     * @param buffer StringBuffer in question
     * @param i index at which we want code point to be inserted
     * @param ch code point to be inserted
     */
    public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {
        if (ch > 0xffff) {
            buffer.insert(i, leadUnit(ch));
            buffer.insert(i + 1, trailUnit(ch));
            return;
        }
        buffer.insert(i, (char) ch);
    }

    /**
     * Method setCodePointAt. Replaces the code point that starts at a
     * given index. Can change the length of the string.
     * @param buffer StringBuffer in question
     * @param i index at which we want to change the contents
     * @param ch replacement code point
     * @return int difference in resulting StringBuffer length: -1, 0 or 1
     */
    public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {
        // Inspect the current contents before the buffer is modified.
        final boolean oldIsSupplementary = nextCodePoint(buffer, i) > 0xffff;
        final boolean newIsSupplementary = ch > 0xffff;

        if (newIsSupplementary) {
            buffer.setCharAt(i, leadUnit(ch));
            if (oldIsSupplementary) {
                // Pair replaces pair: overwrite the old trail in place.
                buffer.setCharAt(i + 1, trailUnit(ch));
                return 0;
            }
            // Pair replaces a single unit: buffer grows by one.
            buffer.insert(i + 1, trailUnit(ch));
            return 1;
        }

        buffer.setCharAt(i, (char) ch);
        if (oldIsSupplementary) {
            // Single unit replaces a pair: drop the old trail, buffer shrinks.
            buffer.deleteCharAt(i + 1);
            return -1;
        }
        return 0;
    }

    /**
     * Method countCodePoint. Counts the code points in a UTF-16 encoded
     * string. A lead surrogate directly followed by a trail surrogate
     * counts as one; every other code unit counts as one on its own.
     * @param source String in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(String source) {
        return countCodePoints(source);
    }

    /**
     * Method countCodePoint. Counts the code points in a UTF-16 encoded
     * string buffer. A lead surrogate directly followed by a trail
     * surrogate counts as one; every other code unit counts as one on its
     * own.
     * @param source StringBuffer in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(StringBuffer source) {
        return countCodePoints(source);
    }

    /**
     * Determines how many chars this char32 requires.
     * No validity check is performed on the argument.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     */
    public static int getCharCount(int char32) {
        return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return char16 >= LEAD_SURROGATE_MIN_VALUE
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return char16 >= TRAIL_SURROGATE_MIN_VALUE
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return char16 >= LEAD_SURROGATE_MIN_VALUE
                && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Extract a single UTF-32 value from a range of a char array.
     * Usable for random access: the char at the given offset may be either
     * half of a surrogate pair, and both neighbours are examined.
     * If the char retrieved is part of a surrogate pair that lies inside
     * [start, limit), its supplementary character is returned. If a complete
     * supplementary character is not found, the unmatched char is returned.
     * No validity check is performed on the result.
     * @param source array of UTF-16 chars
     * @param start offset of the first char of the range in the source array
     * @param limit offset just past the last char of the range in the
     *        source array
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the code point that contains the char at
     *         offset16.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char[] source, int start, int limit,
                             int offset16) {
        final int index = start + offset16;
        if (index < start || index >= limit) {
            throw new ArrayIndexOutOfBoundsException(index);
        }

        final char unit = source[index];

        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (isLeadSurrogate(unit)) {
            final int following = index + 1;
            if (following < limit) {
                final char trail = source[following];
                if (isTrailSurrogate(trail)) {
                    return getRawSupplementary(unit, trail);
                }
            }
        } else if (isTrailSurrogate(unit) && index != start) {
            final char lead = source[index - 1];
            if (isLeadSurrogate(lead)) {
                return getRawSupplementary(lead, unit);
            }
        }
        return unit; // BMP char or unmatched surrogate
    }

    /**
     * Forms a supplementary code point from the argument characters.<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail) {
        final int shiftedLead = lead << LEAD_SURROGATE_SHIFT_;
        return shiftedLead + trail + SURROGATE_OFFSET_;
    }

    /**
     * Shared implementation of nextCodePoint for any char sequence.
     */
    private static int decodeForward(CharSequence text, int index) {
        final char first = text.charAt(index);
        if (!isLeadSurrogate(first)) {
            return first;
        }
        final int following = index + 1;
        if (following >= text.length()) {
            return first; // lead surrogate at the very end
        }
        final char second = text.charAt(following);
        return isTrailSurrogate(second) ? getRawSupplementary(first, second) : first;
    }

    /**
     * Shared implementation of prevCodePoint for any char sequence.
     */
    private static int decodeBackward(CharSequence text, int index) {
        final int last = index - 1;
        final char second = text.charAt(last);
        if (isTrailSurrogate(second) && last > 0) {
            final char first = text.charAt(last - 1);
            if (isLeadSurrogate(first)) {
                return getRawSupplementary(first, second);
            }
        }
        return second;
    }

    /**
     * Shared implementation of countCodePoint for any char sequence.
     */
    private static int countCodePoints(CharSequence text) {
        int total = 0;
        boolean pendingLead = false;

        for (int index = 0; index < text.length(); index++) {
            final char unit = text.charAt(index);
            if (pendingLead && isTrailSurrogate(unit)) {
                // Trail completing a pair: already counted with its lead.
                pendingLead = false;
                continue;
            }
            pendingLead = isLeadSurrogate(unit);
            total++;
        }
        return total;
    }

    /**
     * Computes the lead surrogate for a code point above 0xffff.
     */
    private static char leadUnit(int supplementary) {
        return (char) (LEAD_OFFSET_ + (supplementary >> LEAD_SURROGATE_SHIFT_));
    }

    /**
     * Computes the trail surrogate for a code point above 0xffff.
     */
    private static char trailUnit(int supplementary) {
        return (char) (TRAIL_SURROGATE_MIN_VALUE + (supplementary & TRAIL_MASK_));
    }

}
380