1#undef G_DISABLE_ASSERT 2#undef G_LOG_DOMAIN 3 4#include <stdarg.h> 5#include <stdio.h> 6#include <stdlib.h> 7#include <string.h> 8#include <glib.h> 9 10static gint exit_status = 0; 11 12static void 13croak (char *format, ...) 14{ 15 va_list va; 16 17 va_start (va, format); 18 vfprintf (stderr, format, va); 19 va_end (va); 20 21 exit (1); 22} 23 24static void 25fail (char *format, ...) 26{ 27 va_list va; 28 29 va_start (va, format); 30 vfprintf (stderr, format, va); 31 va_end (va); 32 33 exit_status |= 1; 34} 35 36typedef enum 37{ 38 VALID, 39 INCOMPLETE, 40 NOTUNICODE, 41 OVERLONG, 42 MALFORMED 43} Status; 44 45static gboolean 46ucs4_equal (gunichar *a, gunichar *b) 47{ 48 while (*a && *b && (*a == *b)) 49 { 50 a++; 51 b++; 52 } 53 54 return (*a == *b); 55} 56 57static gboolean 58utf16_equal (gunichar2 *a, gunichar2 *b) 59{ 60 while (*a && *b && (*a == *b)) 61 { 62 a++; 63 b++; 64 } 65 66 return (*a == *b); 67} 68 69static gint 70utf16_count (gunichar2 *a) 71{ 72 gint result = 0; 73 74 while (a[result]) 75 result++; 76 77 return result; 78} 79 80static void 81process (gint line, 82 gchar *utf8, 83 Status status, 84 gunichar *ucs4, 85 gint ucs4_len) 86{ 87 const gchar *end; 88 gboolean is_valid = g_utf8_validate (utf8, -1, &end); 89 GError *error = NULL; 90 glong items_read, items_written; 91 92 switch (status) 93 { 94 case VALID: 95 if (!is_valid) 96 { 97 fail ("line %d: valid but g_utf8_validate returned FALSE\n", line); 98 return; 99 } 100 break; 101 case NOTUNICODE: 102 case INCOMPLETE: 103 case OVERLONG: 104 case MALFORMED: 105 if (is_valid) 106 { 107 fail ("line %d: invalid but g_utf8_validate returned TRUE\n", line); 108 return; 109 } 110 break; 111 } 112 113 if (status == INCOMPLETE) 114 { 115 gunichar *ucs4_result; 116 117 ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error); 118 119 if (!error || !g_error_matches (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT)) 120 { 121 fail ("line %d: incomplete input not properly detected\n", line); 122 return; 123 } 124 g_clear_error (&error); 125 126 ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error); 127 128 if (!ucs4_result || items_read == strlen (utf8)) 129 { 130 fail ("line %d: incomplete input not properly detected\n", line); 131 return; 132 } 133 134 g_free (ucs4_result); 135 } 136 137 if (status == VALID || status == NOTUNICODE) 138 { 139 gunichar *ucs4_result; 140 gchar *utf8_result; 141 142 ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error); 143 if (!ucs4_result) 144 { 145 fail ("line %d: conversion to ucs4 failed: %s\n", line, error->message); 146 return; 147 } 148 149 if (!ucs4_equal (ucs4_result, ucs4) || 150 items_read != strlen (utf8) || 151 items_written != ucs4_len) 152 { 153 fail ("line %d: results of conversion to ucs4 do not match expected.\n", line); 154 return; 155 } 156 157 g_free (ucs4_result); 158 159 ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written); 160 161 if (!ucs4_equal (ucs4_result, ucs4) || 162 items_written != ucs4_len) 163 { 164 fail ("line %d: results of conversion to ucs4 do not match expected.\n", line); 165 return; 166 } 167 168 utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error); 169 if (!utf8_result) 170 { 171 fail ("line %d: conversion back to utf8 failed: %s", line, error->message); 172 return; 173 } 174 175 if (strcmp (utf8_result, utf8) != 0 || 176 items_read != ucs4_len || 177 items_written != strlen (utf8)) 178 { 179 fail ("line %d: conversion back to utf8 did not match original\n", line); 180 return; 181 } 182 183 g_free (utf8_result); 184 g_free (ucs4_result); 185 } 186 187 if (status == VALID) 188 { 189 gunichar2 *utf16_expected_tmp; 190 gunichar2 *utf16_expected; 191 gunichar2 *utf16_from_utf8; 192 gunichar2 *utf16_from_ucs4; 193 gunichar *ucs4_result; 194 gsize bytes_written; 195 gint n_chars; 196 gchar *utf8_result; 197 198#if G_BYTE_ORDER == G_LITTLE_ENDIAN 199#define TARGET "UTF-16LE" 200#else 201#define TARGET "UTF-16" 202#endif 203 204 if (!(utf16_expected_tmp = (gunichar2 *)g_convert (utf8, -1, TARGET, "UTF-8", 205 NULL, &bytes_written, NULL))) 206 { 207 fail ("line %d: could not convert to UTF-16 via g_convert\n", line); 208 return; 209 } 210 211 /* zero-terminate and remove BOM 212 */ 213 n_chars = bytes_written / 2; 214 if (utf16_expected_tmp[0] == 0xfeff) /* BOM */ 215 { 216 n_chars--; 217 utf16_expected = g_new (gunichar2, n_chars + 1); 218 memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars); 219 } 220 else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */ 221 { 222 fail ("line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line); 223 return; 224 } 225 else 226 { 227 utf16_expected = g_new (gunichar2, n_chars + 1); 228 memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars); 229 } 230 231 utf16_expected[n_chars] = '\0'; 232 233 if (!(utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error))) 234 { 235 fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message); 236 return; 237 } 238 239 if (items_read != strlen (utf8) || 240 utf16_count (utf16_from_utf8) != items_written) 241 { 242 fail ("line %d: length error in conversion to ucs16\n", line); 243 return; 244 } 245 246 if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error))) 247 { 248 fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message); 249 return; 250 } 251 252 if (items_read != ucs4_len || 253 utf16_count (utf16_from_ucs4) != items_written) 254 { 255 fail ("line %d: length error in conversion to ucs16\n", line); 256 return; 257 } 258 259 if (!utf16_equal (utf16_from_utf8, utf16_expected) || 260 !utf16_equal (utf16_from_ucs4, utf16_expected)) 261 { 262 fail ("line %d: results of conversion to ucs16 do not match\n", line); 263 return; 264 } 265 266 if (!(utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error))) 267 { 268 fail ("line %d: conversion back to utf8 failed: %s\n", line, error->message); 269 return; 270 } 271 272 if (items_read != utf16_count (utf16_from_utf8) || 273 items_written != strlen (utf8)) 274 { 275 fail ("line %d: length error in conversion from ucs16 to utf8\n", line); 276 return; 277 } 278 279 if (!(ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error))) 280 { 281 fail ("line %d: conversion back to utf8/ucs4 failed\n", line); 282 return; 283 } 284 285 if (items_read != utf16_count (utf16_from_utf8) || 286 items_written != ucs4_len) 287 { 288 fail ("line %d: length error in conversion from ucs16 to ucs4\n", line); 289 return; 290 } 291 292 if (strcmp (utf8, utf8_result) != 0 || 293 !ucs4_equal (ucs4, ucs4_result)) 294 { 295 fail ("line %d: conversion back to utf8/ucs4 did not match original\n", line); 296 return; 297 } 298 299 g_free (utf16_expected_tmp); 300 g_free (utf16_expected); 301 g_free (utf16_from_utf8); 302 g_free (utf16_from_ucs4); 303 g_free (utf8_result); 304 g_free (ucs4_result); 305 } 306} 307 308int 309main (int argc, char **argv) 310{ 311 gchar *srcdir = getenv ("srcdir"); 312 gchar *testfile; 313 gchar *contents; 314 GError *error = NULL; 315 gchar *p, *end; 316 char *tmp; 317 gint state = 0; 318 gint line = 1; 319 gint start_line = 0; /* Quiet GCC */ 320 gchar *utf8 = NULL; /* Quiet GCC */ 321 GArray *ucs4; 322 Status status = VALID; /* Quiet GCC */ 323 324 if (!srcdir) 325 srcdir = "."; 326 327 testfile = g_strconcat (srcdir, G_DIR_SEPARATOR_S "utf8.txt", NULL); 328 329 g_file_get_contents (testfile, &contents, NULL, &error); 330 if (error) 331 croak ("Cannot open utf8.txt: %s", error->message); 332 333 ucs4 = g_array_new (TRUE, FALSE, sizeof(gunichar)); 334 335 p = contents; 336 337 /* Loop over lines */ 338 while (*p) 339 { 340 while (*p && (*p == ' ' || *p == '\t')) 341 p++; 342 343 end = p; 344 while (*end && (*end != '\r' && *end != '\n')) 345 end++; 346 347 if (!*p || *p == '#' || *p == '\r' || *p == '\n') 348 goto next_line; 349 350 tmp = g_strstrip (g_strndup (p, end - p)); 351 352 switch (state) 353 { 354 case 0: 355 /* UTF-8 string */ 356 start_line = line; 357 utf8 = tmp; 358 tmp = NULL; 359 break; 360 361 case 1: 362 /* Status */ 363 if (!strcmp (tmp, "VALID")) 364 status = VALID; 365 else if (!strcmp (tmp, "INCOMPLETE")) 366 status = INCOMPLETE; 367 else if (!strcmp (tmp, "NOTUNICODE")) 368 status = NOTUNICODE; 369 else if (!strcmp (tmp, "OVERLONG")) 370 status = OVERLONG; 371 else if (!strcmp (tmp, "MALFORMED")) 372 status = MALFORMED; 373 else 374 croak ("Invalid status on line %d\n", line); 375 376 if (status != VALID && status != NOTUNICODE) 377 state++; /* No UCS-4 data */ 378 379 break; 380 381 case 2: 382 /* UCS-4 version */ 383 384 p = strtok (tmp, " \t"); 385 while (p) 386 { 387 gchar *endptr; 388 389 gunichar ch = strtoul (p, &endptr, 16); 390 if (*endptr != '\0') 391 croak ("Invalid UCS-4 character on line %d\n", line); 392 393 g_array_append_val (ucs4, ch); 394 395 p = strtok (NULL, " \t"); 396 } 397 398 break; 399 } 400 401 g_free (tmp); 402 state = (state + 1) % 3; 403 404 if (state == 0) 405 { 406 process (start_line, utf8, status, (gunichar *)ucs4->data, ucs4->len); 407 g_array_set_size (ucs4, 0); 408 g_free (utf8); 409 } 410 411 next_line: 412 p = end; 413 if (*p && *p == '\r') 414 p++; 415 if (*p && *p == '\n') 416 p++; 417 418 line++; 419 } 420 421 return exit_status; 422} 423