// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
 *******************************************************************************
 * Copyright (C) 2002-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.dev.test;

/**
 * Utility class for supplementary code point support.
 * This one is written purely for updating the Normalization sample from the
 * unicode.org site. If you want the real thing, use the UTF16 class from ICU4J.
 *
 * <p>None of the methods validate their code point arguments: values are split
 * into, or combined from, UTF-16 code units with plain arithmetic. Unpaired
 * surrogates are always treated as single code points.
 *
 * @author Vladimir Weinstein, Markus Scherer
 */
public class UTF16Util {

    /**
     * The minimum value for supplementary code points.
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate minimum value.
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Lead surrogate maximum value.
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate minimum value.
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Trail surrogate maximum value.
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Offset to add to a combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
            SUPPLEMENTARY_MIN_VALUE
            - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            - TRAIL_SURROGATE_MIN_VALUE;

    /**
     * Amount to subtract from <code>(lead &lt;&lt; 10) + trail</code> to obtain the
     * code point; this is the negation of the offset used by
     * {@link #getRawSupplementary(char, char)}.
     * Kept under its original name and value (0x35FDC00) for code in this package.
     */
    static final int suppOffset = -SURROGATE_OFFSET_;

    /**
     * Value added to <code>codePoint &gt;&gt; 10</code> to obtain the lead surrogate
     * of a supplementary code point (0xD7C0).
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
            LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    /**
     * Mask selecting the low 10 bits of a code point, which are stored in the
     * trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Returns the code point that starts at a given index of a string.
     * A lead surrogate that is not followed by a trail surrogate is returned
     * as is.
     * @param s String in question
     * @param i index from which we want a code point
     * @return int code point at index i
     */
    public static final int nextCodePoint(String s, int i) {
        return codePointAt(s, i);
    }

    /**
     * Gets the code point preceding index i (predecrement).
     * A trail surrogate that is not preceded by a lead surrogate is returned
     * as is.
     * @param s String in question
     * @param i index in string
     * @return int code point at index --i
     */
    public static final int prevCodePoint(String s, int i) {
        return codePointBefore(s, i);
    }

    /**
     * Returns the code point that starts at a given index of a StringBuffer.
     * A lead surrogate that is not followed by a trail surrogate is returned
     * as is.
     * @param s StringBuffer in question
     * @param i index from which we want a code point
     * @return int code point at index i
     */
    public static final int nextCodePoint(StringBuffer s, int i) {
        return codePointAt(s, i);
    }

    /**
     * Gets the code point preceding index i (predecrement).
     * A trail surrogate that is not preceded by a lead surrogate is returned
     * as is.
     * @param s StringBuffer in question
     * @param i index in string
     * @return int code point at index --i
     */
    public static final int prevCodePoint(StringBuffer s, int i) {
        return codePointBefore(s, i);
    }

    /**
     * Returns the length in UTF-16 code units of a given code point.
     * @param c code point in question
     * @return int length in UTF-16 code units. Can be 1 or 2
     */
    public static final int codePointLength(int c) {
        return getCharCount(c);
    }

    /**
     * Appends a code point to a StringBuffer.
     * @param buffer StringBuffer in question
     * @param ch code point to append
     */
    public static final void appendCodePoint(StringBuffer buffer, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.append((char) ch);
        } else {
            buffer.append(leadSurrogateOf(ch));
            buffer.append(trailSurrogateOf(ch));
        }
    }

    /**
     * Inserts a code point in a StringBuffer.
     * @param buffer StringBuffer in question
     * @param i index at which we want code point to be inserted
     * @param ch code point to be inserted
     */
    public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            buffer.insert(i, (char) ch);
        } else {
            buffer.insert(i, leadSurrogateOf(ch));
            buffer.insert(i + 1, trailSurrogateOf(ch));
        }
    }

    /**
     * Changes the code point at a given index. Can change the length of the
     * string: replacing a supplementary code point by a BMP one removes a code
     * unit, and the reverse inserts one.
     * @param buffer StringBuffer in question
     * @param i index at which we want to change the contents
     * @param ch replacement code point
     * @return int difference in resulting StringBuffer length (-1, 0 or 1)
     */
    public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {
        // Read the old code point first so that a bad index fails before any
        // modification of the buffer.
        int oldLength = getCharCount(nextCodePoint(buffer, i));
        int newLength = getCharCount(ch);

        if (newLength == 1) {
            buffer.setCharAt(i, (char) ch);
            if (oldLength == 2) {
                // BMP instead of supplementary: drop the old trail surrogate.
                buffer.deleteCharAt(i + 1);
            }
        } else {
            buffer.setCharAt(i, leadSurrogateOf(ch));
            if (oldLength == 2) {
                buffer.setCharAt(i + 1, trailSurrogateOf(ch));
            } else {
                // Supplementary instead of BMP: make room for the trail surrogate.
                buffer.insert(i + 1, trailSurrogateOf(ch));
            }
        }
        return newLength - oldLength;
    }

    /**
     * Counts the UTF-32 code points in a UTF-16 encoded string.
     * @param source String in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(String source) {
        return countCodePoints(source);
    }

    /**
     * Counts the UTF-32 code points in a UTF-16 encoded string.
     * @param source StringBuffer in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(StringBuffer source) {
        return countCodePoints(source);
    }

    /**
     * Determines how many chars this char32 requires.
     * No validity check is performed; if one is required, use
     * <code>UCharacter.isLegal()</code> on char32 before calling.
     * @param char32 the input code point.
     * @return 2 if is in supplementary space, otherwise 1.
     */
    public static int getCharCount(int char32) {
        return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Extracts a single UTF-32 value from a substring.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as random access. If a
     * validity check is required, use <code>UCharacter.isLegal()</code> on the
     * return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned, whether the offset points at the lead or at
     * the trail surrogate. If a complete supplementary character is not found
     * within [start, limit), the incomplete character will be returned.
     * @param source array of UTF-16 chars
     * @param start offset to substring in the source array for analyzing
     * @param limit offset to substring in the source array for analyzing
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the code point that contains the char at
     *         offset16.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char[] source, int start, int limit, int offset16) {
        offset16 += start;
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // Lead surrogate: the trail, if any, is the next unit before limit.
            int next = offset16 + 1;
            if (next < limit && isTrailSurrogate(source[next])) {
                return getRawSupplementary(single, source[next]);
            }
        } else if (offset16 > start && isLeadSurrogate(source[offset16 - 1])) {
            // Trail surrogate: the lead, if any, is the previous unit at or after start.
            return getRawSupplementary(source[offset16 - 1], single);
        }
        return single; // return unmatched surrogate
    }

    /**
     * Forms a supplementary code point from the argument characters.<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done.
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail) {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /**
     * Shared implementation of the nextCodePoint overloads: reads the code
     * point whose first code unit is at index i.
     */
    private static int codePointAt(CharSequence s, int i) {
        char first = s.charAt(i);
        if (isLeadSurrogate(first) && ++i < s.length()) {
            char second = s.charAt(i);
            if (isTrailSurrogate(second)) {
                return getRawSupplementary(first, second);
            }
        }
        return first;
    }

    /**
     * Shared implementation of the prevCodePoint overloads: reads the code
     * point whose last code unit is at index i - 1.
     */
    private static int codePointBefore(CharSequence s, int i) {
        char last = s.charAt(--i);
        if (isTrailSurrogate(last) && --i >= 0) {
            char previous = s.charAt(i);
            if (isLeadSurrogate(previous)) {
                return getRawSupplementary(previous, last);
            }
        }
        return last;
    }

    /**
     * Shared implementation of the countCodePoint overloads: counts every code
     * unit except a trail surrogate that directly follows a lead surrogate.
     */
    private static int countCodePoints(CharSequence source) {
        int count = 0;
        boolean afterLeadSurrogate = false;

        for (int i = 0; i < source.length(); ++i) {
            char unit = source.charAt(i);
            if (afterLeadSurrogate && isTrailSurrogate(unit)) {
                afterLeadSurrogate = false; // count valid trail as zero
            } else {
                afterLeadSurrogate = isLeadSurrogate(unit);
                ++count; // count others as 1
            }
        }
        return count;
    }

    /**
     * Computes the first UTF-16 code unit of a supplementary code point.
     */
    private static char leadSurrogateOf(int codePoint) {
        return (char) (LEAD_SURROGATE_OFFSET_ + (codePoint >> LEAD_SURROGATE_SHIFT_));
    }

    /**
     * Computes the second UTF-16 code unit of a supplementary code point.
     */
    private static char trailSurrogateOf(int codePoint) {
        return (char) (TRAIL_SURROGATE_MIN_VALUE + (codePoint & TRAIL_SURROGATE_MASK_));
    }
}