1/* GLIB - Library of useful routines for C programming
2 * Copyright (C) 2001 Matthias Clasen <matthiasc@poet.de>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
18 */
19
20#include "glib.h"
21
/*
 * UNICODE_VALID:
 * Evaluates to non-zero when code_point
 *   - is below 0x110000,
 *   - is not in the surrogate range 0xD800-0xDFFF,
 *   - is not in the range 0xFDD0-0xFDEF, and
 *   - does not have 0xFFFE or 0xFFFF in its low 16 bits.
 * The argument may be evaluated more than once, so it must be free of
 * side effects.
 */
#define UNICODE_VALID(code_point)                              \
    ((code_point) < 0x110000 &&                                \
     (((code_point) & 0xFFFFF800) != 0xD800) &&                \
     !((code_point) >= 0xFDD0 && (code_point) <= 0xFDEF) &&    \
     ((code_point) & 0xFFFE) != 0xFFFE)
27
28
29
/* Set to TRUE by do_test() when a case does not match its expectation;
 * main() turns it into the process exit status. */
static gboolean any_failed = FALSE;
31
32struct {
33  const gchar *text;
34  gint max_len;
35  gint offset;
36  gboolean valid;
37} test[] = {
38  /* some tests to check max_len handling */
39  /* length 1 */
40  { "abcde", -1, 5, TRUE },
41  { "abcde", 3, 3, TRUE },
42  { "abcde", 5, 5, TRUE },
43  { "abcde", 7, 5, FALSE },
44  /* length 2 */
45  { "\xc2\xa9\xc2\xa9\xc2\xa9", -1, 6, TRUE },
46  { "\xc2\xa9\xc2\xa9\xc2\xa9",  1, 0, FALSE },
47  { "\xc2\xa9\xc2\xa9\xc2\xa9",  2, 2, TRUE },
48  { "\xc2\xa9\xc2\xa9\xc2\xa9",  3, 2, FALSE },
49  { "\xc2\xa9\xc2\xa9\xc2\xa9",  4, 4, TRUE },
50  { "\xc2\xa9\xc2\xa9\xc2\xa9",  5, 4, FALSE },
51  { "\xc2\xa9\xc2\xa9\xc2\xa9",  6, 6, TRUE },
52  { "\xc2\xa9\xc2\xa9\xc2\xa9",  7, 6, FALSE },
53  /* length 3 */
54  { "\xe2\x89\xa0\xe2\x89\xa0", -1, 6, TRUE },
55  { "\xe2\x89\xa0\xe2\x89\xa0",  1, 0, FALSE },
56  { "\xe2\x89\xa0\xe2\x89\xa0",  2, 0, FALSE },
57  { "\xe2\x89\xa0\xe2\x89\xa0",  3, 3, TRUE },
58  { "\xe2\x89\xa0\xe2\x89\xa0",  4, 3, FALSE },
59  { "\xe2\x89\xa0\xe2\x89\xa0",  5, 3, FALSE },
60  { "\xe2\x89\xa0\xe2\x89\xa0",  6, 6, TRUE },
61  { "\xe2\x89\xa0\xe2\x89\xa0",  7, 6, FALSE },
62
63  /* examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
64  /* greek 'kosme' */
65  { "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -1, 11, TRUE },
66  /* first sequence of each length */
67  { "\x00", -1, 0, TRUE },
68  { "\xc2\x80", -1, 2, TRUE },
69  { "\xe0\xa0\x80", -1, 3, TRUE },
70  { "\xf0\x90\x80\x80", -1, 4, TRUE },
71  { "\xf8\x88\x80\x80\x80", -1, 0, FALSE },
72  { "\xfc\x84\x80\x80\x80\x80", -1, 0, FALSE },
73  /* last sequence of each length */
74  { "\x7f", -1, 1, TRUE },
75  { "\xdf\xbf", -1, 2, TRUE },
76  { "\xef\xbf\xbf", -1, 0, FALSE },
77  { "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
78  { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
79  { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
80  /* other boundary conditions */
81  { "\xed\x9f\xbf", -1, 3, TRUE },
82  { "\xee\x80\x80", -1, 3, TRUE },
83  { "\xef\xbf\xbd", -1, 3, TRUE },
84  { "\xf4\x8f\xbf\xbf", -1, 0, FALSE },
85  { "\xf4\x90\x80\x80", -1, 0, FALSE },
86  /* malformed sequences */
87  /* continuation bytes */
88  { "\x80", -1, 0, FALSE },
89  { "\xbf", -1, 0, FALSE },
90  { "\x80\xbf", -1, 0, FALSE },
91  { "\x80\xbf\x80", -1, 0, FALSE },
92  { "\x80\xbf\x80\xbf", -1, 0, FALSE },
93  { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
94  { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
95  { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
96
97  /* all possible continuation byte */
98  { "\x80", -1, 0, FALSE },
99  { "\x81", -1, 0, FALSE },
100  { "\x82", -1, 0, FALSE },
101  { "\x83", -1, 0, FALSE },
102  { "\x84", -1, 0, FALSE },
103  { "\x85", -1, 0, FALSE },
104  { "\x86", -1, 0, FALSE },
105  { "\x87", -1, 0, FALSE },
106  { "\x88", -1, 0, FALSE },
107  { "\x89", -1, 0, FALSE },
108  { "\x8a", -1, 0, FALSE },
109  { "\x8b", -1, 0, FALSE },
110  { "\x8c", -1, 0, FALSE },
111  { "\x8d", -1, 0, FALSE },
112  { "\x8e", -1, 0, FALSE },
113  { "\x8f", -1, 0, FALSE },
114  { "\x90", -1, 0, FALSE },
115  { "\x91", -1, 0, FALSE },
116  { "\x92", -1, 0, FALSE },
117  { "\x93", -1, 0, FALSE },
118  { "\x94", -1, 0, FALSE },
119  { "\x95", -1, 0, FALSE },
120  { "\x96", -1, 0, FALSE },
121  { "\x97", -1, 0, FALSE },
122  { "\x98", -1, 0, FALSE },
123  { "\x99", -1, 0, FALSE },
124  { "\x9a", -1, 0, FALSE },
125  { "\x9b", -1, 0, FALSE },
126  { "\x9c", -1, 0, FALSE },
127  { "\x9d", -1, 0, FALSE },
128  { "\x9e", -1, 0, FALSE },
129  { "\x9f", -1, 0, FALSE },
130  { "\xa0", -1, 0, FALSE },
131  { "\xa1", -1, 0, FALSE },
132  { "\xa2", -1, 0, FALSE },
133  { "\xa3", -1, 0, FALSE },
134  { "\xa4", -1, 0, FALSE },
135  { "\xa5", -1, 0, FALSE },
136  { "\xa6", -1, 0, FALSE },
137  { "\xa7", -1, 0, FALSE },
138  { "\xa8", -1, 0, FALSE },
139  { "\xa9", -1, 0, FALSE },
140  { "\xaa", -1, 0, FALSE },
141  { "\xab", -1, 0, FALSE },
142  { "\xac", -1, 0, FALSE },
143  { "\xad", -1, 0, FALSE },
144  { "\xae", -1, 0, FALSE },
145  { "\xaf", -1, 0, FALSE },
146  { "\xb0", -1, 0, FALSE },
147  { "\xb1", -1, 0, FALSE },
148  { "\xb2", -1, 0, FALSE },
149  { "\xb3", -1, 0, FALSE },
150  { "\xb4", -1, 0, FALSE },
151  { "\xb5", -1, 0, FALSE },
152  { "\xb6", -1, 0, FALSE },
153  { "\xb7", -1, 0, FALSE },
154  { "\xb8", -1, 0, FALSE },
155  { "\xb9", -1, 0, FALSE },
156  { "\xba", -1, 0, FALSE },
157  { "\xbb", -1, 0, FALSE },
158  { "\xbc", -1, 0, FALSE },
159  { "\xbd", -1, 0, FALSE },
160  { "\xbe", -1, 0, FALSE },
161  { "\xbf", -1, 0, FALSE },
162  /* lone start characters */
163  { "\xc0\x20", -1, 0, FALSE },
164  { "\xc1\x20", -1, 0, FALSE },
165  { "\xc2\x20", -1, 0, FALSE },
166  { "\xc3\x20", -1, 0, FALSE },
167  { "\xc4\x20", -1, 0, FALSE },
168  { "\xc5\x20", -1, 0, FALSE },
169  { "\xc6\x20", -1, 0, FALSE },
170  { "\xc7\x20", -1, 0, FALSE },
171  { "\xc8\x20", -1, 0, FALSE },
172  { "\xc9\x20", -1, 0, FALSE },
173  { "\xca\x20", -1, 0, FALSE },
174  { "\xcb\x20", -1, 0, FALSE },
175  { "\xcc\x20", -1, 0, FALSE },
176  { "\xcd\x20", -1, 0, FALSE },
177  { "\xce\x20", -1, 0, FALSE },
178  { "\xcf\x20", -1, 0, FALSE },
179  { "\xd0\x20", -1, 0, FALSE },
180  { "\xd1\x20", -1, 0, FALSE },
181  { "\xd2\x20", -1, 0, FALSE },
182  { "\xd3\x20", -1, 0, FALSE },
183  { "\xd4\x20", -1, 0, FALSE },
184  { "\xd5\x20", -1, 0, FALSE },
185  { "\xd6\x20", -1, 0, FALSE },
186  { "\xd7\x20", -1, 0, FALSE },
187  { "\xd8\x20", -1, 0, FALSE },
188  { "\xd9\x20", -1, 0, FALSE },
189  { "\xda\x20", -1, 0, FALSE },
190  { "\xdb\x20", -1, 0, FALSE },
191  { "\xdc\x20", -1, 0, FALSE },
192  { "\xdd\x20", -1, 0, FALSE },
193  { "\xde\x20", -1, 0, FALSE },
194  { "\xdf\x20", -1, 0, FALSE },
195  { "\xe0\x20", -1, 0, FALSE },
196  { "\xe1\x20", -1, 0, FALSE },
197  { "\xe2\x20", -1, 0, FALSE },
198  { "\xe3\x20", -1, 0, FALSE },
199  { "\xe4\x20", -1, 0, FALSE },
200  { "\xe5\x20", -1, 0, FALSE },
201  { "\xe6\x20", -1, 0, FALSE },
202  { "\xe7\x20", -1, 0, FALSE },
203  { "\xe8\x20", -1, 0, FALSE },
204  { "\xe9\x20", -1, 0, FALSE },
205  { "\xea\x20", -1, 0, FALSE },
206  { "\xeb\x20", -1, 0, FALSE },
207  { "\xec\x20", -1, 0, FALSE },
208  { "\xed\x20", -1, 0, FALSE },
209  { "\xee\x20", -1, 0, FALSE },
210  { "\xef\x20", -1, 0, FALSE },
211  { "\xf0\x20", -1, 0, FALSE },
212  { "\xf1\x20", -1, 0, FALSE },
213  { "\xf2\x20", -1, 0, FALSE },
214  { "\xf3\x20", -1, 0, FALSE },
215  { "\xf4\x20", -1, 0, FALSE },
216  { "\xf5\x20", -1, 0, FALSE },
217  { "\xf6\x20", -1, 0, FALSE },
218  { "\xf7\x20", -1, 0, FALSE },
219  { "\xf8\x20", -1, 0, FALSE },
220  { "\xf9\x20", -1, 0, FALSE },
221  { "\xfa\x20", -1, 0, FALSE },
222  { "\xfb\x20", -1, 0, FALSE },
223  { "\xfc\x20", -1, 0, FALSE },
224  { "\xfd\x20", -1, 0, FALSE },
225  /* missing continuation bytes */
226  { "\x20\xc0", -1, 1, FALSE },
227  { "\x20\xe0\x80", -1, 1, FALSE },
228  { "\x20\xf0\x80\x80", -1, 1, FALSE },
229  { "\x20\xf8\x80\x80\x80", -1, 1, FALSE },
230  { "\x20\xfc\x80\x80\x80\x80", -1, 1, FALSE },
231  { "\x20\xdf", -1, 1, FALSE },
232  { "\x20\xef\xbf", -1, 1, FALSE },
233  { "\x20\xf7\xbf\xbf", -1, 1, FALSE },
234  { "\x20\xfb\xbf\xbf\xbf", -1, 1, FALSE },
235  { "\x20\xfd\xbf\xbf\xbf\xbf", -1, 1, FALSE },
236  /* impossible bytes */
237  { "\x20\xfe\x20", -1, 1, FALSE },
238  { "\x20\xff\x20", -1, 1, FALSE },
239  /* overlong sequences */
240  { "\x20\xc0\xaf\x20", -1, 1, FALSE },
241  { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
242  { "\x20\xf0\x80\x80\xaf\x20", -1, 1, FALSE },
243  { "\x20\xf8\x80\x80\x80\xaf\x20", -1, 1, FALSE },
244  { "\x20\xfc\x80\x80\x80\x80\xaf\x20", -1, 1, FALSE },
245  { "\x20\xc1\xbf\x20", -1, 1, FALSE },
246  { "\x20\xe0\x9f\xbf\x20", -1, 1, FALSE },
247  { "\x20\xf0\x8f\xbf\xbf\x20", -1, 1, FALSE },
248  { "\x20\xf8\x87\xbf\xbf\xbf\x20", -1, 1, FALSE },
249  { "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -1, 1, FALSE },
250  { "\x20\xc0\x80\x20", -1, 1, FALSE },
251  { "\x20\xe0\x80\x80\x20", -1, 1, FALSE },
252  { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
253  { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
254  { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
255  /* illegal code positions */
256  { "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
257  { "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
258  { "\x20\xed\xae\x80\x20", -1, 1, FALSE },
259  { "\x20\xed\xaf\xbf\x20", -1, 1, FALSE },
260  { "\x20\xed\xb0\x80\x20", -1, 1, FALSE },
261  { "\x20\xed\xbe\x80\x20", -1, 1, FALSE },
262  { "\x20\xed\xbf\xbf\x20", -1, 1, FALSE },
263  { "\x20\xed\xa0\x80\xed\xb0\x80\x20", -1, 1, FALSE },
264  { "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
265  { "\x20\xed\xad\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
266  { "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
267  { "\x20\xed\xae\x80\xed\xb0\x80\x20", -1, 1, FALSE },
268  { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
269  { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
270  { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
271  { "\x20\xef\xbf\xbe\x20", -1, 1, FALSE },
272  { "\x20\xef\xbf\xbf\x20", -1, 1, FALSE },
273
274  { NULL, }
275};
276
277static void
278do_test (gint         index,
279	 const gchar *text,
280	 gint         max_len,
281	 gint         offset,
282	 gboolean     valid)
283{
284  const gchar *end;
285  gboolean result;
286
287  result = g_utf8_validate (text, max_len, &end);
288
289  if (result != valid || end - text != offset)
290    {
291      GString *str;
292      const gchar *p;
293
294      any_failed = TRUE;
295
296      str = g_string_new (0);
297      for (p = text; *p; p++)
298	g_string_append_printf (str, "\\x%02hhx", *p);
299      g_print ("%d: g_utf8_validate (\"%s\", %d) failed, "
300	       "expected %s %d, got %s %d\n",
301	       index,
302	       str->str, max_len,
303	       valid ? "TRUE" : "FALSE", offset,
304	       result ? "TRUE" : "FALSE", (gint) (end - text));
305      g_string_free (str, FALSE);
306    }
307}
308
309int
310main (int argc, char *argv[])
311{
312  gint i;
313
314  for (i = 0; test[i].text; i++)
315    do_test (i, test[i].text, test[i].max_len,
316	     test[i].offset, test[i].valid);
317
318  return any_failed ? 1 : 0;
319}
320