1/*
2 * Copyright © 2009  Red Hat, Inc.
3 * Copyright © 2009  Keith Stribley
4 * Copyright © 2011  Google, Inc.
5 *
6 *  This is part of HarfBuzz, a text shaping library.
7 *
8 * Permission is hereby granted, without written agreement and without
9 * license or royalty fees, to use, copy, modify, and distribute this
10 * software and its documentation for any purpose, provided that the
11 * above copyright notice and the following two paragraphs appear in
12 * all copies of this software.
13 *
14 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
15 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
16 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
17 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
18 * DAMAGE.
19 *
20 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
22 * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
23 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
24 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
25 *
26 * Red Hat Author(s): Behdad Esfahbod
27 * Google Author(s): Behdad Esfahbod
28 */
29
30#include "hb-private.hh"
31
32#include "hb-icu.h"
33
34#include "hb-unicode-private.hh"
35
36#include <unicode/uchar.h>
37#include <unicode/unorm.h>
38#include <unicode/ustring.h>
39#include <unicode/utf16.h>
40#include <unicode/uversion.h>
41
42
43hb_script_t
44hb_icu_script_to_script (UScriptCode script)
45{
46  if (unlikely (script == USCRIPT_INVALID_CODE))
47    return HB_SCRIPT_INVALID;
48
49  return hb_script_from_string (uscript_getShortName (script), -1);
50}
51
52UScriptCode
53hb_icu_script_from_script (hb_script_t script)
54{
55  if (unlikely (script == HB_SCRIPT_INVALID))
56    return USCRIPT_INVALID_CODE;
57
58  for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++)
59    if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script))
60      return (UScriptCode) i;
61
62  return USCRIPT_UNKNOWN;
63}
64
65
66static hb_unicode_combining_class_t
67hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED,
68				hb_codepoint_t      unicode,
69				void               *user_data HB_UNUSED)
70
71{
72  return (hb_unicode_combining_class_t) u_getCombiningClass (unicode);
73}
74
75static unsigned int
76hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED,
77				hb_codepoint_t      unicode,
78				void               *user_data HB_UNUSED)
79{
80  switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH))
81  {
82  case U_EA_WIDE:
83  case U_EA_FULLWIDTH:
84    return 2;
85  case U_EA_NEUTRAL:
86  case U_EA_AMBIGUOUS:
87  case U_EA_HALFWIDTH:
88  case U_EA_NARROW:
89    return 1;
90  }
91  return 1;
92}
93
94static hb_unicode_general_category_t
95hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED,
96				 hb_codepoint_t      unicode,
97				 void               *user_data HB_UNUSED)
98{
99  switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
100  {
101  case U_UNASSIGNED:			return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
102
103  case U_UPPERCASE_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
104  case U_LOWERCASE_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
105  case U_TITLECASE_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
106  case U_MODIFIER_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
107  case U_OTHER_LETTER:			return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
108
109  case U_NON_SPACING_MARK:		return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
110  case U_ENCLOSING_MARK:		return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
111  case U_COMBINING_SPACING_MARK:	return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK;
112
113  case U_DECIMAL_DIGIT_NUMBER:		return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
114  case U_LETTER_NUMBER:			return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
115  case U_OTHER_NUMBER:			return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
116
117  case U_SPACE_SEPARATOR:		return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
118  case U_LINE_SEPARATOR:		return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
119  case U_PARAGRAPH_SEPARATOR:		return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
120
121  case U_CONTROL_CHAR:			return HB_UNICODE_GENERAL_CATEGORY_CONTROL;
122  case U_FORMAT_CHAR:			return HB_UNICODE_GENERAL_CATEGORY_FORMAT;
123  case U_PRIVATE_USE_CHAR:		return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
124  case U_SURROGATE:			return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
125
126
127  case U_DASH_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
128  case U_START_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
129  case U_END_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
130  case U_CONNECTOR_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
131  case U_OTHER_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
132
133  case U_MATH_SYMBOL:			return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
134  case U_CURRENCY_SYMBOL:		return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
135  case U_MODIFIER_SYMBOL:		return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
136  case U_OTHER_SYMBOL:			return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
137
138  case U_INITIAL_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
139  case U_FINAL_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
140  }
141
142  return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
143}
144
145static hb_codepoint_t
146hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED,
147			  hb_codepoint_t      unicode,
148			  void               *user_data HB_UNUSED)
149{
150  return u_charMirror(unicode);
151}
152
153static hb_script_t
154hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED,
155		       hb_codepoint_t      unicode,
156		       void               *user_data HB_UNUSED)
157{
158  UErrorCode status = U_ZERO_ERROR;
159  UScriptCode scriptCode = uscript_getScript(unicode, &status);
160
161  if (unlikely (U_FAILURE (status)))
162    return HB_SCRIPT_UNKNOWN;
163
164  return hb_icu_script_to_script (scriptCode);
165}
166
#if U_ICU_VERSION_MAJOR_NUM >= 49
/* Normalizer handed to unorm2_composePair() and
 * unorm2_getRawDecomposition() by the compose/decompose callbacks.
 * It is installed by hb_icu_get_unicode_funcs() from
 * unorm2_getNFCInstance(); a failure of that call is ignored there, so
 * this file never checks the pointer for NULL. */
static const UNormalizer2 *normalizer;
#endif
170
171static hb_bool_t
172hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
173			hb_codepoint_t      a,
174			hb_codepoint_t      b,
175			hb_codepoint_t     *ab,
176			void               *user_data HB_UNUSED)
177{
178#if U_ICU_VERSION_MAJOR_NUM >= 49
179  {
180    UChar32 ret = unorm2_composePair (normalizer, a, b);
181    if (ret < 0) return false;
182    *ab = ret;
183    return true;
184  }
185#endif
186
187  /* We don't ifdef-out the fallback code such that compiler always
188   * sees it and makes sure it's compilable. */
189
190  UChar utf16[4], normalized[5];
191  unsigned int len;
192  hb_bool_t ret, err;
193  UErrorCode icu_err;
194
195  len = 0;
196  err = false;
197  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err);
198  if (err) return false;
199  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err);
200  if (err) return false;
201
202  icu_err = U_ZERO_ERROR;
203  len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
204  if (U_FAILURE (icu_err))
205    return false;
206  if (u_countChar32 (normalized, len) == 1) {
207    U16_GET_UNSAFE (normalized, 0, *ab);
208    ret = true;
209  } else {
210    ret = false;
211  }
212
213  return ret;
214}
215
216static hb_bool_t
217hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
218			  hb_codepoint_t      ab,
219			  hb_codepoint_t     *a,
220			  hb_codepoint_t     *b,
221			  void               *user_data HB_UNUSED)
222{
223#if U_ICU_VERSION_MAJOR_NUM >= 49
224  {
225    UChar decomposed[4];
226    int len;
227    UErrorCode icu_err = U_ZERO_ERROR;
228    len = unorm2_getRawDecomposition (normalizer, ab, decomposed,
229				      ARRAY_LENGTH (decomposed), &icu_err);
230    if (U_FAILURE (icu_err) || len < 0) return false;
231
232    len = u_countChar32 (decomposed, len);
233    if (len == 1) {
234      U16_GET_UNSAFE (decomposed, 0, *a);
235      *b = 0;
236      return *a != ab;
237    } else if (len == 2) {
238      len =0;
239      U16_NEXT_UNSAFE (decomposed, len, *a);
240      U16_NEXT_UNSAFE (decomposed, len, *b);
241    }
242    return true;
243  }
244#endif
245
246  /* We don't ifdef-out the fallback code such that compiler always
247   * sees it and makes sure it's compilable. */
248
249  UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
250  unsigned int len;
251  hb_bool_t ret, err;
252  UErrorCode icu_err;
253
254  /* This function is a monster! Maybe it wasn't a good idea adding a
255   * pairwise decompose API... */
256  /* Watchout for the dragons.  Err, watchout for macros changing len. */
257
258  len = 0;
259  err = false;
260  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
261  if (err) return false;
262
263  icu_err = U_ZERO_ERROR;
264  len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
265  if (U_FAILURE (icu_err))
266    return false;
267
268  len = u_countChar32 (normalized, len);
269
270  if (len == 1) {
271    U16_GET_UNSAFE (normalized, 0, *a);
272    *b = 0;
273    ret = *a != ab;
274  } else if (len == 2) {
275    len =0;
276    U16_NEXT_UNSAFE (normalized, len, *a);
277    U16_NEXT_UNSAFE (normalized, len, *b);
278
279    /* Here's the ugly part: if ab decomposes to a single character and
280     * that character decomposes again, we have to detect that and undo
281     * the second part :-(. */
282    UChar recomposed[20];
283    icu_err = U_ZERO_ERROR;
284    unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
285    if (U_FAILURE (icu_err))
286      return false;
287    hb_codepoint_t c;
288    U16_GET_UNSAFE (recomposed, 0, c);
289    if (c != *a && c != ab) {
290      *a = c;
291      *b = 0;
292    }
293    ret = true;
294  } else {
295    /* If decomposed to more than two characters, take the last one,
296     * and recompose the rest to get the first component. */
297    U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */
298    UChar recomposed[18 * 2];
299    icu_err = U_ZERO_ERROR;
300    len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
301    if (U_FAILURE (icu_err))
302      return false;
303    /* We expect that recomposed has exactly one character now. */
304    if (unlikely (u_countChar32 (recomposed, len) != 1))
305      return false;
306    U16_GET_UNSAFE (recomposed, 0, *a);
307    ret = true;
308  }
309
310  return ret;
311}
312
313static unsigned int
314hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED,
315					hb_codepoint_t      u,
316					hb_codepoint_t     *decomposed,
317					void               *user_data HB_UNUSED)
318{
319  UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
320  unsigned int len;
321  int32_t utf32_len;
322  hb_bool_t err;
323  UErrorCode icu_err;
324
325  /* Copy @u into a UTF-16 array to be passed to ICU. */
326  len = 0;
327  err = false;
328  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err);
329  if (err)
330    return 0;
331
332  /* Normalise the codepoint using NFKD mode. */
333  icu_err = U_ZERO_ERROR;
334  len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
335  if (icu_err)
336    return 0;
337
338  /* Convert the decomposed form from UTF-16 to UTF-32. */
339  icu_err = U_ZERO_ERROR;
340  u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err);
341  if (icu_err)
342    return 0;
343
344  return utf32_len;
345}
346
347
348hb_unicode_funcs_t *
349hb_icu_get_unicode_funcs (void)
350{
351  static const hb_unicode_funcs_t _hb_icu_unicode_funcs = {
352    HB_OBJECT_HEADER_STATIC,
353
354    NULL, /* parent */
355    true, /* immutable */
356    {
357#define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name,
358      HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
359#undef HB_UNICODE_FUNC_IMPLEMENT
360    }
361  };
362
363#if U_ICU_VERSION_MAJOR_NUM >= 49
364  if (!hb_atomic_ptr_get (&normalizer)) {
365    UErrorCode icu_err = U_ZERO_ERROR;
366    /* We ignore failure in getNFCInstace(). */
367    (void) hb_atomic_ptr_cmpexch (&normalizer, NULL, unorm2_getNFCInstance (&icu_err));
368  }
369#endif
370  return const_cast<hb_unicode_funcs_t *> (&_hb_icu_unicode_funcs);
371}
372