hb-icu.cc revision 378d279bbf692195c4654e312dae854ab3be04cf
1e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger/*
2e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * Copyright © 2009  Red Hat, Inc.
3e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * Copyright © 2009  Keith Stribley
4e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * Copyright © 2011  Google, Inc.
5e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger *
6e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger *  This is part of HarfBuzz, a text shaping library.
7e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger *
8e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * Permission is hereby granted, without written agreement and without
9e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * license or royalty fees, to use, copy, modify, and distribute this
10e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * software and its documentation for any purpose, provided that the
11e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * above copyright notice and the following two paragraphs appear in
12e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * all copies of this software.
13e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger *
14e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
15e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
16e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
17e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
18e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger * DAMAGE.
19e23af2a86ed22c2a11d820820b78353b095e7ae7Joerg Sonnenberger *
20 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
22 * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
23 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
24 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
25 *
26 * Red Hat Author(s): Behdad Esfahbod
27 * Google Author(s): Behdad Esfahbod
28 */
29
30#include "hb-private.hh"
31
32#include "hb-icu.h"
33
34#include "hb-unicode-private.hh"
35
36#include <unicode/uversion.h>
37#include <unicode/uchar.h>
38#include <unicode/unorm.h>
39#include <unicode/ustring.h>
40
41
42
43hb_script_t
44hb_icu_script_to_script (UScriptCode script)
45{
46  if (unlikely (script == USCRIPT_INVALID_CODE))
47    return HB_SCRIPT_INVALID;
48
49  return hb_script_from_string (uscript_getShortName (script), -1);
50}
51
52UScriptCode
53hb_icu_script_from_script (hb_script_t script)
54{
55  if (unlikely (script == HB_SCRIPT_INVALID))
56    return USCRIPT_INVALID_CODE;
57
58  for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++)
59    if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script))
60      return (UScriptCode) i;
61
62  return USCRIPT_UNKNOWN;
63}
64
65
66static unsigned int
67hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED,
68				hb_codepoint_t      unicode,
69				void               *user_data HB_UNUSED)
70
71{
72  return u_getCombiningClass (unicode);
73}
74
75static unsigned int
76hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED,
77				hb_codepoint_t      unicode,
78				void               *user_data HB_UNUSED)
79{
80  switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH))
81  {
82  case U_EA_WIDE:
83  case U_EA_FULLWIDTH:
84    return 2;
85  case U_EA_NEUTRAL:
86  case U_EA_AMBIGUOUS:
87  case U_EA_HALFWIDTH:
88  case U_EA_NARROW:
89    return 1;
90  }
91  return 1;
92}
93
94static hb_unicode_general_category_t
95hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED,
96				 hb_codepoint_t      unicode,
97				 void               *user_data HB_UNUSED)
98{
99  switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
100  {
101  case U_UNASSIGNED:			return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
102
103  case U_UPPERCASE_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
104  case U_LOWERCASE_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
105  case U_TITLECASE_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
106  case U_MODIFIER_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
107  case U_OTHER_LETTER:			return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
108
109  case U_NON_SPACING_MARK:		return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
110  case U_ENCLOSING_MARK:		return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
111  case U_COMBINING_SPACING_MARK:	return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK;
112
113  case U_DECIMAL_DIGIT_NUMBER:		return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
114  case U_LETTER_NUMBER:			return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
115  case U_OTHER_NUMBER:			return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
116
117  case U_SPACE_SEPARATOR:		return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
118  case U_LINE_SEPARATOR:		return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
119  case U_PARAGRAPH_SEPARATOR:		return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
120
121  case U_CONTROL_CHAR:			return HB_UNICODE_GENERAL_CATEGORY_CONTROL;
122  case U_FORMAT_CHAR:			return HB_UNICODE_GENERAL_CATEGORY_FORMAT;
123  case U_PRIVATE_USE_CHAR:		return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
124  case U_SURROGATE:			return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
125
126
127  case U_DASH_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
128  case U_START_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
129  case U_END_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
130  case U_CONNECTOR_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
131  case U_OTHER_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
132
133  case U_MATH_SYMBOL:			return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
134  case U_CURRENCY_SYMBOL:		return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
135  case U_MODIFIER_SYMBOL:		return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
136  case U_OTHER_SYMBOL:			return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
137
138  case U_INITIAL_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
139  case U_FINAL_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
140  }
141
142  return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
143}
144
145static hb_codepoint_t
146hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED,
147			  hb_codepoint_t      unicode,
148			  void               *user_data HB_UNUSED)
149{
150  return u_charMirror(unicode);
151}
152
153static hb_script_t
154hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED,
155		       hb_codepoint_t      unicode,
156		       void               *user_data HB_UNUSED)
157{
158  UErrorCode status = U_ZERO_ERROR;
159  UScriptCode scriptCode = uscript_getScript(unicode, &status);
160
161  if (unlikely (U_FAILURE (status)))
162    return HB_SCRIPT_UNKNOWN;
163
164  return hb_icu_script_to_script (scriptCode);
165}
166
167static hb_bool_t
168hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
169			hb_codepoint_t      a,
170			hb_codepoint_t      b,
171			hb_codepoint_t     *ab,
172			void               *user_data HB_UNUSED)
173{
174  if (!a || !b)
175    return false;
176
177  UChar utf16[4], normalized[5];
178  int len;
179  hb_bool_t ret, err;
180  UErrorCode icu_err;
181
182  len = 0;
183  err = false;
184  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err);
185  if (err) return false;
186  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err);
187  if (err) return false;
188
189  icu_err = U_ZERO_ERROR;
190  len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
191  if (U_FAILURE (icu_err))
192    return false;
193  if (u_countChar32 (normalized, len) == 1) {
194    U16_GET_UNSAFE (normalized, 0, *ab);
195    ret = true;
196  } else {
197    ret = false;
198  }
199
200  return ret;
201}
202
203static hb_bool_t
204hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
205			  hb_codepoint_t      ab,
206			  hb_codepoint_t     *a,
207			  hb_codepoint_t     *b,
208			  void               *user_data HB_UNUSED)
209{
210  UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
211  int len;
212  hb_bool_t ret, err;
213  UErrorCode icu_err;
214
215  /* This function is a monster! Maybe it wasn't a good idea adding a
216   * pairwise decompose API... */
217  /* Watchout for the dragons.  Err, watchout for macros changing len. */
218
219  len = 0;
220  err = false;
221  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
222  if (err) return false;
223
224  icu_err = U_ZERO_ERROR;
225  len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
226  if (U_FAILURE (icu_err))
227    return false;
228
229  len = u_countChar32 (normalized, len);
230
231  if (len == 1) {
232    U16_GET_UNSAFE (normalized, 0, *a);
233    *b = 0;
234    ret = *a != ab;
235  } else if (len == 2) {
236    len =0;
237    U16_NEXT_UNSAFE (normalized, len, *a);
238    U16_NEXT_UNSAFE (normalized, len, *b);
239
240    /* Here's the ugly part: if ab decomposes to a single character and
241     * that character decomposes again, we have to detect that and undo
242     * the second part :-(. */
243    UChar recomposed[20];
244    icu_err = U_ZERO_ERROR;
245    unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
246    if (U_FAILURE (icu_err))
247      return false;
248    hb_codepoint_t c;
249    U16_GET_UNSAFE (recomposed, 0, c);
250    if (c != *a && c != ab) {
251      *a = c;
252      *b = 0;
253    }
254    ret = true;
255  } else {
256    /* If decomposed to more than two characters, take the last one,
257     * and recompose the rest to get the first component. */
258    U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */
259    UChar recomposed[18 * 2];
260    icu_err = U_ZERO_ERROR;
261    len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
262    if (U_FAILURE (icu_err))
263      return false;
264    /* We expect that recomposed has exactly one character now. */
265    if (unlikely (u_countChar32 (recomposed, len) != 1))
266      return false;
267    U16_GET_UNSAFE (recomposed, 0, *a);
268    ret = true;
269  }
270
271  return ret;
272}
273
274static unsigned int
275hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED,
276					hb_codepoint_t      u,
277					hb_codepoint_t     *decomposed,
278					void               *user_data HB_UNUSED)
279{
280  UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
281  gint len;
282  int32_t utf32_len;
283  hb_bool_t err;
284  UErrorCode icu_err;
285
286  /* Copy @u into a UTF-16 array to be passed to ICU. */
287  len = 0;
288  err = FALSE;
289  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err);
290  if (err)
291    return 0;
292
293  /* Normalise the codepoint using NFKD mode. */
294  icu_err = U_ZERO_ERROR;
295  len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
296  if (icu_err)
297    return 0;
298
299  /* Convert the decomposed form from UTF-16 to UTF-32. */
300  icu_err = U_ZERO_ERROR;
301  u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err);
302  if (icu_err)
303    return 0;
304
305  return utf32_len;
306}
307
308
/* Statically-initialized Unicode function table backed by ICU.
 *
 * The callback slots are filled by expanding
 * HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS with HB_UNICODE_FUNC_IMPLEMENT
 * temporarily defined so that each entry `name` becomes the
 * hb_icu_unicode_<name> function defined earlier in this file.  The
 * list of names (and hence the slot order) comes from wherever
 * HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS is defined, not from this file. */
extern HB_INTERNAL const hb_unicode_funcs_t _hb_icu_unicode_funcs;
const hb_unicode_funcs_t _hb_icu_unicode_funcs = {
  HB_OBJECT_HEADER_STATIC,

  NULL, /* parent */
  true, /* immutable */
  {
    /* One hb_icu_unicode_<name> entry per callback. */
#define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name,
    HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
#undef HB_UNICODE_FUNC_IMPLEMENT
  }
};
321
322hb_unicode_funcs_t *
323hb_icu_get_unicode_funcs (void)
324{
325  return const_cast<hb_unicode_funcs_t *> (&_hb_icu_unicode_funcs);
326}
327
328
329