hb-icu.cc revision 378d279bbf692195c4654e312dae854ab3be04cf
1e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger/* 2e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * Copyright © 2009 Red Hat, Inc. 3e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * Copyright © 2009 Keith Stribley 4e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * Copyright © 2011 Google, Inc. 5e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * 6e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * This is part of HarfBuzz, a text shaping library. 7e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * 8e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * Permission is hereby granted, without written agreement and without 9e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * license or royalty fees, to use, copy, modify, and distribute this 10e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * software and its documentation for any purpose, provided that the 11e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * above copyright notice and the following two paragraphs appear in 12e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * all copies of this software. 13e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * 14e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR 15e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 16e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN 17e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 18e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * DAMAGE. 19e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * 20 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, 21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 23 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO 24 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 25 * 26 * Red Hat Author(s): Behdad Esfahbod 27 * Google Author(s): Behdad Esfahbod 28 */ 29 30#include "hb-private.hh" 31 32#include "hb-icu.h" 33 34#include "hb-unicode-private.hh" 35 36#include <unicode/uversion.h> 37#include <unicode/uchar.h> 38#include <unicode/unorm.h> 39#include <unicode/ustring.h> 40 41 42 43hb_script_t 44hb_icu_script_to_script (UScriptCode script) 45{ 46 if (unlikely (script == USCRIPT_INVALID_CODE)) 47 return HB_SCRIPT_INVALID; 48 49 return hb_script_from_string (uscript_getShortName (script), -1); 50} 51 52UScriptCode 53hb_icu_script_from_script (hb_script_t script) 54{ 55 if (unlikely (script == HB_SCRIPT_INVALID)) 56 return USCRIPT_INVALID_CODE; 57 58 for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++) 59 if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script)) 60 return (UScriptCode) i; 61 62 return USCRIPT_UNKNOWN; 63} 64 65 66static unsigned int 67hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED, 68 hb_codepoint_t unicode, 69 void *user_data HB_UNUSED) 70 71{ 72 return u_getCombiningClass (unicode); 73} 74 75static unsigned int 76hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED, 77 hb_codepoint_t unicode, 78 void *user_data HB_UNUSED) 79{ 80 switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH)) 81 { 82 case U_EA_WIDE: 83 case U_EA_FULLWIDTH: 84 return 2; 85 case U_EA_NEUTRAL: 86 case U_EA_AMBIGUOUS: 87 case U_EA_HALFWIDTH: 88 case U_EA_NARROW: 89 return 1; 90 } 91 return 1; 92} 93 94static hb_unicode_general_category_t 95hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED, 96 hb_codepoint_t unicode, 97 void *user_data HB_UNUSED) 98{ 99 switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY)) 100 { 101 case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; 102 103 case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER; 104 case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER; 105 case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER; 106 case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER; 107 case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER; 108 109 case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK; 110 case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK; 111 case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK; 112 113 case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER; 114 case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER; 115 case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER; 116 117 case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR; 118 case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR; 119 case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR; 120 121 case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL; 122 case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT; 123 case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE; 124 case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE; 125 126 127 case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION; 128 case U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION; 129 case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION; 130 case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION; 131 case U_OTHER_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION; 132 133 case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL; 134 case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL; 135 case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL; 136 case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL; 137 138 case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION; 139 case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION; 140 } 141 142 return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; 143} 144 145static hb_codepoint_t 146hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED, 147 hb_codepoint_t unicode, 148 void *user_data HB_UNUSED) 149{ 150 return u_charMirror(unicode); 151} 152 153static hb_script_t 154hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED, 155 hb_codepoint_t unicode, 156 void *user_data HB_UNUSED) 157{ 158 UErrorCode status = U_ZERO_ERROR; 159 UScriptCode scriptCode = uscript_getScript(unicode, &status); 160 161 if (unlikely (U_FAILURE (status))) 162 return HB_SCRIPT_UNKNOWN; 163 164 return hb_icu_script_to_script (scriptCode); 165} 166 167static hb_bool_t 168hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED, 169 hb_codepoint_t a, 170 hb_codepoint_t b, 171 hb_codepoint_t *ab, 172 void *user_data HB_UNUSED) 173{ 174 if (!a || !b) 175 return false; 176 177 UChar utf16[4], normalized[5]; 178 int len; 179 hb_bool_t ret, err; 180 UErrorCode icu_err; 181 182 len = 0; 183 err = false; 184 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err); 185 if (err) return false; 186 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err); 187 if (err) return false; 188 189 icu_err = U_ZERO_ERROR; 190 len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); 191 if (U_FAILURE (icu_err)) 192 return false; 193 if (u_countChar32 (normalized, len) == 1) { 194 U16_GET_UNSAFE (normalized, 0, *ab); 195 ret = true; 196 } else { 197 ret = false; 198 } 199 200 return ret; 201} 202 203static hb_bool_t 204hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, 205 hb_codepoint_t ab, 206 hb_codepoint_t *a, 207 hb_codepoint_t *b, 208 void *user_data HB_UNUSED) 209{ 210 UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; 211 int len; 212 hb_bool_t ret, err; 213 UErrorCode icu_err; 214 215 /* This function is a monster! Maybe it wasn't a good idea adding a 216 * pairwise decompose API... */ 217 /* Watchout for the dragons. Err, watchout for macros changing len. */ 218 219 len = 0; 220 err = false; 221 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err); 222 if (err) return false; 223 224 icu_err = U_ZERO_ERROR; 225 len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); 226 if (U_FAILURE (icu_err)) 227 return false; 228 229 len = u_countChar32 (normalized, len); 230 231 if (len == 1) { 232 U16_GET_UNSAFE (normalized, 0, *a); 233 *b = 0; 234 ret = *a != ab; 235 } else if (len == 2) { 236 len =0; 237 U16_NEXT_UNSAFE (normalized, len, *a); 238 U16_NEXT_UNSAFE (normalized, len, *b); 239 240 /* Here's the ugly part: if ab decomposes to a single character and 241 * that character decomposes again, we have to detect that and undo 242 * the second part :-(. */ 243 UChar recomposed[20]; 244 icu_err = U_ZERO_ERROR; 245 unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); 246 if (U_FAILURE (icu_err)) 247 return false; 248 hb_codepoint_t c; 249 U16_GET_UNSAFE (recomposed, 0, c); 250 if (c != *a && c != ab) { 251 *a = c; 252 *b = 0; 253 } 254 ret = true; 255 } else { 256 /* If decomposed to more than two characters, take the last one, 257 * and recompose the rest to get the first component. */ 258 U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */ 259 UChar recomposed[18 * 2]; 260 icu_err = U_ZERO_ERROR; 261 len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); 262 if (U_FAILURE (icu_err)) 263 return false; 264 /* We expect that recomposed has exactly one character now. */ 265 if (unlikely (u_countChar32 (recomposed, len) != 1)) 266 return false; 267 U16_GET_UNSAFE (recomposed, 0, *a); 268 ret = true; 269 } 270 271 return ret; 272} 273 274static unsigned int 275hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, 276 hb_codepoint_t u, 277 hb_codepoint_t *decomposed, 278 void *user_data HB_UNUSED) 279{ 280 UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; 281 gint len; 282 int32_t utf32_len; 283 hb_bool_t err; 284 UErrorCode icu_err; 285 286 /* Copy @u into a UTF-16 array to be passed to ICU. */ 287 len = 0; 288 err = FALSE; 289 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err); 290 if (err) 291 return 0; 292 293 /* Normalise the codepoint using NFKD mode. */ 294 icu_err = U_ZERO_ERROR; 295 len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); 296 if (icu_err) 297 return 0; 298 299 /* Convert the decomposed form from UTF-16 to UTF-32. */ 300 icu_err = U_ZERO_ERROR; 301 u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err); 302 if (icu_err) 303 return 0; 304 305 return utf32_len; 306} 307 308 309extern HB_INTERNAL const hb_unicode_funcs_t _hb_icu_unicode_funcs; 310const hb_unicode_funcs_t _hb_icu_unicode_funcs = { 311 HB_OBJECT_HEADER_STATIC, 312 313 NULL, /* parent */ 314 true, /* immutable */ 315 { 316#define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name, 317 HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS 318#undef HB_UNICODE_FUNC_IMPLEMENT 319 } 320}; 321 322hb_unicode_funcs_t * 323hb_icu_get_unicode_funcs (void) 324{ 325 return const_cast<hb_unicode_funcs_t *> (&_hb_icu_unicode_funcs); 326} 327 328 329