hb-utf-private.hh revision e634fed4285ce440d277345727ed01757df6d779
1/*
2 * Copyright © 2011,2012,2014  Google, Inc.
3 *
4 *  This is part of HarfBuzz, a text shaping library.
5 *
6 * Permission is hereby granted, without written agreement and without
7 * license or royalty fees, to use, copy, modify, and distribute this
8 * software and its documentation for any purpose, provided that the
9 * above copyright notice and the following two paragraphs appear in
10 * all copies of this software.
11 *
12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16 * DAMAGE.
17 *
18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20 * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23 *
24 * Google Author(s): Behdad Esfahbod
25 */
26
27#ifndef HB_UTF_PRIVATE_HH
28#define HB_UTF_PRIVATE_HH
29
30#include "hb-private.hh"
31
32
33/* UTF-8 */
34
35static inline const uint8_t *
36hb_utf_next (const uint8_t *text,
37	     const uint8_t *end,
38	     hb_codepoint_t *unicode)
39{
40  /* Written to only accept well-formed sequences.
41   * Based on ideas from ICU's U8_NEXT.
42   * Generates a -1 for each ill-formed byte. */
43
44  hb_codepoint_t c = *text++;
45
46  if (c > 0x7Fu)
47  {
48    if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
49    {
50      unsigned int t1;
51      if (likely (text < end &&
52		  (t1 = text[0] - 0x80u) <= 0x3Fu))
53      {
54	c = ((c&0x1Fu)<<6) | t1;
55	text++;
56      }
57      else
58	goto error;
59    }
60    else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
61    {
62      unsigned int t1, t2;
63      if (likely (1 < end - text &&
64		  (t1 = text[0] - 0x80u) <= 0x3Fu &&
65		  (t2 = text[1] - 0x80u) <= 0x3Fu))
66      {
67	c = ((c&0xFu)<<12) | (t1<<6) | t2;
68	if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu)))
69	  goto error;
70	text += 2;
71      }
72      else
73	goto error;
74    }
75    else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
76    {
77      unsigned int t1, t2, t3;
78      if (likely (2 < end - text &&
79		  (t1 = text[0] - 0x80u) <= 0x3Fu &&
80		  (t2 = text[1] - 0x80u) <= 0x3Fu &&
81		  (t3 = text[2] - 0x80u) <= 0x3Fu))
82      {
83	c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
84	if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu)))
85	  goto error;
86	text += 3;
87      }
88      else
89	goto error;
90    }
91    else
92      goto error;
93  }
94
95  *unicode = c;
96  return text;
97
98error:
99  *unicode = -1;
100  return text;
101}
102
103static inline const uint8_t *
104hb_utf_prev (const uint8_t *text,
105	     const uint8_t *start,
106	     hb_codepoint_t *unicode)
107{
108  const uint8_t *end = text--;
109  while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
110    text--;
111
112  if (likely (hb_utf_next (text, end, unicode) == end))
113    return text;
114
115  *unicode = -1;
116  return end - 1;
117}
118
119
static inline unsigned int
hb_utf_strlen (const uint8_t *text)
{
  /* Number of bytes before the terminating zero byte; counts code
   * units, not decoded characters. */
  const char *bytes = (const char *) text;
  return strlen (bytes);
}
125
126
127/* UTF-16 */
128
129static inline const uint16_t *
130hb_utf_next (const uint16_t *text,
131	     const uint16_t *end,
132	     hb_codepoint_t *unicode)
133{
134  hb_codepoint_t c = *text++;
135
136  if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
137  {
138    *unicode = c;
139    return text;
140  }
141
142  if (likely (hb_in_range (c, 0xD800u, 0xDBFFu)))
143  {
144    /* High-surrogate in c */
145    hb_codepoint_t l;
146    if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu))))
147    {
148      /* Low-surrogate in l */
149      *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
150       text++;
151       return text;
152    }
153  }
154
155  /* Lonely / out-of-order surrogate. */
156  *unicode = -1;
157  return text;
158}
159
160static inline const uint16_t *
161hb_utf_prev (const uint16_t *text,
162	     const uint16_t *start,
163	     hb_codepoint_t *unicode)
164{
165  const uint16_t *end = text--;
166  hb_codepoint_t c = *text;
167
168  if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
169  {
170    *unicode = c;
171    return text;
172  }
173
174  if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu)))
175    text--;
176
177  if (likely (hb_utf_next (text, end, unicode) == end))
178    return text;
179
180  *unicode = -1;
181  return end - 1;
182}
183
184
static inline unsigned int
hb_utf_strlen (const uint16_t *text)
{
  /* Number of 16-bit code units before the terminating zero unit;
   * a surrogate pair counts as two. */
  const uint16_t *cursor = text;
  while (*cursor != 0)
    cursor++;
  return (unsigned int) (cursor - text);
}
192
193
194/* UTF-32 */
195
196static inline const uint32_t *
197hb_utf_next (const uint32_t *text,
198	     const uint32_t *end HB_UNUSED,
199	     hb_codepoint_t *unicode)
200{
201  hb_codepoint_t c = *text++;
202  if (unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
203    goto error;
204  *unicode = c;
205  return text;
206
207error:
208  *unicode = -1;
209  return text;
210}
211
212static inline const uint32_t *
213hb_utf_prev (const uint32_t *text,
214	     const uint32_t *start HB_UNUSED,
215	     hb_codepoint_t *unicode)
216{
217  hb_utf_next (text - 1, text, unicode);
218  return text - 1;
219}
220
static inline unsigned int
hb_utf_strlen (const uint32_t *text)
{
  /* Number of 32-bit units before the terminating zero unit. */
  const uint32_t *cursor;
  for (cursor = text; *cursor != 0; cursor++)
    ;
  return (unsigned int) (cursor - text);
}
228
229
230#endif /* HB_UTF_PRIVATE_HH */
231