1/** 2******************************************************************************* 3* Copyright (C) 2002-2004, International Business Machines Corporation and * 4* others. All Rights Reserved. * 5******************************************************************************* 6*/ 7package com.ibm.icu.dev.test; 8 9/** 10 * Utility class for supplementary code point 11 * support. This one is written purely for updating 12 * Normalization sample from the unicode.org site. 13 * If you want the real thing, use UTF16 class 14 * from ICU4J 15 * @author Vladimir Weinstein, Markus Scherer 16 */ 17public class UTF16Util { 18 static final int suppOffset = (0xd800 << 10) + 0xdc00 - 0x10000; 19 20 /** 21 * Method nextCodePoint. Returns the next code point 22 * in a string. 23 * @param s String in question 24 * @param i index from which we want a code point 25 * @return int codepoint at index i 26 */ 27 public static final int nextCodePoint(String s, int i) { 28 int ch = s.charAt(i); 29 if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) { 30 int ch2 = s.charAt(i); 31 if (0xdc00 <= ch2 && ch2 <= 0xdfff) { 32 ch = (ch << 10) + ch2 - suppOffset; 33 } 34 } 35 return ch; 36 } 37 38 /** 39 * Method prevCodePoint. Gets the code point preceding 40 * index i (predecrement). 41 * @param s String in question 42 * @param i index in string 43 * @return int codepoint at index --i 44 */ 45 public static final int prevCodePoint(String s, int i) { 46 int ch = s.charAt(--i); 47 if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) { 48 int ch2 = s.charAt(i); 49 if (0xd800 <= ch2 && ch2 <= 0xdbff) { 50 ch = (ch2 << 10) + ch - suppOffset; 51 } 52 } 53 return ch; 54 } 55 56 /** 57 * Method nextCodePoint. Returns the next code point 58 * in a string. 59 * @param s StringBuffer in question 60 * @param i index from which we want a code point 61 * @return int codepoint at index i 62 */ 63 public static final int nextCodePoint(StringBuffer s, int i) { 64 int ch = s.charAt(i); 65 if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) { 66 int ch2 = s.charAt(i); 67 if (0xdc00 <= ch2 && ch2 <= 0xdfff) { 68 ch = (ch << 10) + ch2 - suppOffset; 69 } 70 } 71 return ch; 72 } 73 74 /** 75 * Method prevCodePoint. Gets the code point preceding 76 * index i (predecrement). 77 * @param s StringBuffer in question 78 * @param i index in string 79 * @return int codepoint at index --i 80 */ 81 public static final int prevCodePoint(StringBuffer s, int i) { 82 int ch = s.charAt(--i); 83 if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) { 84 int ch2 = s.charAt(i); 85 if (0xd800 <= ch2 && ch2 <= 0xdbff) { 86 ch = (ch2 << 10) + ch - suppOffset; 87 } 88 } 89 return ch; 90 } 91 92 /** 93 * Method codePointLength. Returns the length 94 * in UTF-16 code units of a given code point 95 * @param c code point in question 96 * @return int length in UTF-16 code units. Can be 1 or 2 97 */ 98 public static final int codePointLength(int c) { 99 return c <= 0xffff ? 1 : 2; 100 } 101 102 /** 103 * Method appendCodePoint. Appends a code point 104 * to a StringBuffer 105 * @param buffer StringBuffer in question 106 * @param ch code point to append 107 */ 108 public static final void appendCodePoint(StringBuffer buffer, int ch) { 109 if (ch <= 0xffff) { 110 buffer.append((char)ch); 111 } else { 112 buffer.append((char)(0xd7c0 + (ch >> 10))); 113 buffer.append((char)(0xdc00 + (ch & 0x3ff))); 114 } 115 } 116 117 /** 118 * Method insertCodePoint. Inserts a code point in 119 * a StringBuffer 120 * @param buffer StringBuffer in question 121 * @param i index at which we want code point to be inserted 122 * @param ch code point to be inserted 123 */ 124 public static final void insertCodePoint(StringBuffer buffer, int i, int ch) { 125 if (ch <= 0xffff) { 126 buffer.insert(i, (char)ch); 127 } else { 128 buffer.insert(i, (char)(0xd7c0 + (ch >> 10))).insert(i + 1, (char)(0xdc00 + (ch & 0x3ff))); 129 } 130 } 131 132 /** 133 * Method setCodePointAt. Changes a code point at a 134 * given index. Can change the length of the string. 135 * @param buffer StringBuffer in question 136 * @param i index at which we want to change the contents 137 * @param ch replacement code point 138 * @return int difference in resulting StringBuffer length 139 */ 140 public static final int setCodePointAt(StringBuffer buffer, int i, int ch) { 141 int cp = nextCodePoint(buffer, i); 142 143 if (ch <= 0xffff && cp <= 0xffff) { // Both BMP 144 buffer.setCharAt(i, (char)ch); 145 return 0; 146 } else if (ch > 0xffff && cp > 0xffff) { // Both supplementary 147 buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10))); 148 buffer.setCharAt(i+1, (char)(0xdc00 + (ch & 0x3ff))); 149 return 0; 150 } else if (ch <= 0xffff && cp > 0xffff) { // putting BMP instead of supplementary, buffer shrinks 151 buffer.setCharAt(i, (char)ch); 152 buffer.deleteCharAt(i+1); 153 return -1; 154 } else { //if (ch > 0xffff && cp <= 0xffff) { // putting supplementary instead of BMP, buffer grows 155 buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10))); 156 buffer.insert(i+1, (char)(0xdc00 + (ch & 0x3ff))); 157 return 1; 158 } 159 } 160 161 /** 162 * Method countCodePoint. Counts the UTF-32 code points 163 * in a UTF-16 encoded string. 164 * @param source String in question. 165 * @return int number of code points in this string 166 */ 167 public static final int countCodePoint(String source) 168 { 169 int result = 0; 170 char ch; 171 boolean hadLeadSurrogate = false; 172 173 for (int i = 0; i < source.length(); ++ i) 174 { 175 ch = source.charAt(i); 176 if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) { 177 hadLeadSurrogate = false; // count valid trail as zero 178 } 179 else 180 { 181 hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff); 182 ++ result; // count others as 1 183 } 184 } 185 186 return result; 187 } 188 189 /** 190 * Method countCodePoint. Counts the UTF-32 code points 191 * in a UTF-16 encoded string. 192 * @param source StringBuffer in question. 193 * @return int number of code points in this string 194 */ 195 public static final int countCodePoint(StringBuffer source) 196 { 197 int result = 0; 198 char ch; 199 boolean hadLeadSurrogate = false; 200 201 for (int i = 0; i < source.length(); ++ i) 202 { 203 ch = source.charAt(i); 204 if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) { 205 hadLeadSurrogate = false; // count valid trail as zero 206 } 207 else 208 { 209 hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff); 210 ++ result; // count others as 1 211 } 212 } 213 214 return result; 215 } 216 /** 217 * The minimum value for Supplementary code points 218 */ 219 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 220 /** 221 * Determines how many chars this char32 requires. 222 * If a validity check is required, use <code> 223 * <a href="../UCharacter.html#isLegal(char)">isLegal()</a></code> on 224 * char32 before calling. 225 * @param char32 the input codepoint. 226 * @return 2 if is in supplementary space, otherwise 1. 227 */ 228 public static int getCharCount(int char32) 229 { 230 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 231 return 1; 232 } 233 return 2; 234 } 235 /** 236 * Lead surrogate maximum value 237 * @stable ICU 2.1 238 */ 239 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 240 /** 241 * Lead surrogate minimum value 242 * @stable ICU 2.1 243 */ 244 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 245 246 /** 247 * Trail surrogate minimum value 248 * @stable ICU 2.1 249 */ 250 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 251 /** 252 * Trail surrogate maximum value 253 * @stable ICU 2.1 254 */ 255 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 256 /** 257 * Determines whether the code value is a surrogate. 258 * @param char16 the input character. 259 * @return true iff the input character is a surrogate. 260 * @stable ICU 2.1 261 */ 262 public static boolean isSurrogate(char char16) 263 { 264 return LEAD_SURROGATE_MIN_VALUE <= char16 && 265 char16 <= TRAIL_SURROGATE_MAX_VALUE; 266 } 267 268 /** 269 * Determines whether the character is a trail surrogate. 270 * @param char16 the input character. 271 * @return true iff the input character is a trail surrogate. 272 * @stable ICU 2.1 273 */ 274 public static boolean isTrailSurrogate(char char16) 275 { 276 return (TRAIL_SURROGATE_MIN_VALUE <= char16 && 277 char16 <= TRAIL_SURROGATE_MAX_VALUE); 278 } 279 280 /** 281 * Determines whether the character is a lead surrogate. 282 * @param char16 the input character. 283 * @return true iff the input character is a lead surrogate 284 * @stable ICU 2.1 285 */ 286 public static boolean isLeadSurrogate(char char16) 287 { 288 return LEAD_SURROGATE_MIN_VALUE <= char16 && 289 char16 <= LEAD_SURROGATE_MAX_VALUE; 290 } 291 /** 292 * Extract a single UTF-32 value from a substring. 293 * Used when iterating forwards or backwards (with 294 * <code>UTF16.getCharCount()</code>, as well as random access. If a 295 * validity check is required, use 296 * <code><a href="../UCharacter.html#isLegal(char)">UCharacter.isLegal() 297 * </a></code> on the return value. 298 * If the char retrieved is part of a surrogate pair, its supplementary 299 * character will be returned. If a complete supplementary character is 300 * not found the incomplete character will be returned 301 * @param source array of UTF-16 chars 302 * @param start offset to substring in the source array for analyzing 303 * @param limit offset to substring in the source array for analyzing 304 * @param offset16 UTF-16 offset relative to start 305 * @return UTF-32 value for the UTF-32 value that contains the char at 306 * offset16. The boundaries of that codepoint are the same as in 307 * <code>bounds32()</code>. 308 * @exception IndexOutOfBoundsException thrown if offset16 is not within 309 * the range of start and limit. 310 * @stable ICU 2.1 311 */ 312 public static int charAt(char source[], int start, int limit, 313 int offset16) 314 { 315 offset16 += start; 316 if (offset16 < start || offset16 >= limit) { 317 throw new ArrayIndexOutOfBoundsException(offset16); 318 } 319 320 char single = source[offset16]; 321 if (!isSurrogate(single)) { 322 return single; 323 } 324 325 // Convert the UTF-16 surrogate pair if necessary. 326 // For simplicity in usage, and because the frequency of pairs is 327 // low, look both directions. 328 if (single <= LEAD_SURROGATE_MAX_VALUE) { 329 offset16 ++; 330 if (offset16 >= limit) { 331 return single; 332 } 333 char trail = source[offset16]; 334 if (isTrailSurrogate(trail)) { 335 return getRawSupplementary(single, trail); 336 } 337 } 338 else { // isTrailSurrogate(single), so 339 if (offset16 == start) { 340 return single; 341 } 342 offset16 --; 343 char lead = source[offset16]; 344 if (isLeadSurrogate(lead)) 345 return getRawSupplementary(lead, single); 346 } 347 return single; // return unmatched surrogate 348 } 349 /** 350 * Shift value for lead surrogate to form a supplementary character. 351 */ 352 private static final int LEAD_SURROGATE_SHIFT_ = 10; 353 354 /** 355 * Offset to add to combined surrogate pair to avoid msking. 356 */ 357 private static final int SURROGATE_OFFSET_ = 358 SUPPLEMENTARY_MIN_VALUE - 359 (LEAD_SURROGATE_MIN_VALUE << 360 LEAD_SURROGATE_SHIFT_) - 361 TRAIL_SURROGATE_MIN_VALUE; 362 363 364 /** 365 * Forms a supplementary code point from the argument character<br> 366 * Note this is for internal use hence no checks for the validity of the 367 * surrogate characters are done 368 * @param lead lead surrogate character 369 * @param trail trailing surrogate character 370 * @return code point of the supplementary character 371 */ 372 public static int getRawSupplementary(char lead, char trail) 373 { 374 return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; 375 } 376 377} 378