10891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/* gutf8.c - Operations on UTF-8 strings. 20891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 30891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Copyright (C) 1999 Tom Tromey 40891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Copyright (C) 2000 Red Hat, Inc. 50891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 60891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * This library is free software; you can redistribute it and/or 7c9bd7542e1a28ba9de60048361c0a97d251833e7Tim Janik * modify it under the terms of the GNU Lesser General Public 80891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * License as published by the Free Software Foundation; either 90891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * version 2 of the License, or (at your option) any later version. 100891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 110891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * This library is distributed in the hope that it will be useful, 120891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * but WITHOUT ANY WARRANTY; without even the implied warranty of 130891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14c9bd7542e1a28ba9de60048361c0a97d251833e7Tim Janik * Lesser General Public License for more details. 150891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 16c9bd7542e1a28ba9de60048361c0a97d251833e7Tim Janik * You should have received a copy of the GNU Lesser General Public 170891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * License along with this library; if not, write to the 180891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 190891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Boston, MA 02111-1307, USA. 200891c64816faaadc8e26f9eebb3205af11323473Owen Taylor */ 210891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 22307391459ddd322bccee190d8ded32fb462e8c50Sebastian Wilhelmi#include "config.h" 230891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 240891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#include <stdlib.h> 250c01eb3a7313a580fe34044bcdc3337d067c8a15Jaikumar Ganesh#ifndef ANDROID_STUB 26960868332881b9ca1dc94baafa8aa42c07df3101Owen Taylor#ifdef HAVE_CODESET 270891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#include <langinfo.h> 280891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#endif 29bb28d819383d1cbebb355153d2f53c858288835fJaikumar Ganesh#endif 300891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#include <string.h> 310891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 320891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#include "glib.h" 330891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 34754d8ddad85ef7054bf262bbc57ee67a0278f493Tor Lillqvist#ifdef G_PLATFORM_WIN32 358a0df0a71c88c04d4fccbc6780b7105bc527261bTor Lillqvist#include <stdio.h> 368a0df0a71c88c04d4fccbc6780b7105bc527261bTor Lillqvist#define STRICT 374f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist#include <windows.h> 38754d8ddad85ef7054bf262bbc57ee67a0278f493Tor Lillqvist#undef STRICT 394f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist#endif 404f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist 410c01eb3a7313a580fe34044bcdc3337d067c8a15Jaikumar Ganesh#ifndef ANDROID_STUB 42b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor#include "libcharset/libcharset.h" 43bb28d819383d1cbebb355153d2f53c858288835fJaikumar Ganesh#endif 44b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 45b8796462fbed70f55219da6c3c8f6611de8f2a8cOwen Taylor#include "glibintl.h" 4648876d7fb573f2a3823c20564b705fe1f36726c4Matthias Clasen#include "galias.h" 47956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 480891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#define UTF8_COMPUTE(Char, Mask, Len) \ 490891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (Char < 128) \ 500891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 510891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 1; \ 520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x7f; \ 530891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xe0) == 0xc0) \ 550891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 560891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 2; \ 570891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x1f; \ 580891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 590891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xf0) == 0xe0) \ 600891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 610891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 3; \ 620891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x0f; \ 630891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 640891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xf8) == 0xf0) \ 650891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 4; \ 670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x07; \ 680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 690891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xfc) == 0xf8) \ 700891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 5; \ 720891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x03; \ 730891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 740891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xfe) == 0xfc) \ 750891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 760891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 6; \ 770891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x01; \ 780891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 790891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else \ 800891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = -1; 810891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 82956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor#define UTF8_LENGTH(Char) \ 83956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x80 ? 1 : \ 84956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x800 ? 2 : \ 85956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x10000 ? 3 : \ 86956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x200000 ? 4 : \ 87956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x4000000 ? 5 : 6))))) 88956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 89956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 900891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#define UTF8_GET(Result, Chars, Count, Mask, Len) \ 910891c64816faaadc8e26f9eebb3205af11323473Owen Taylor (Result) = (Chars)[0] & (Mask); \ 920891c64816faaadc8e26f9eebb3205af11323473Owen Taylor for ((Count) = 1; (Count) < (Len); ++(Count)) \ 930891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 940891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (((Chars)[(Count)] & 0xc0) != 0x80) \ 950891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 960891c64816faaadc8e26f9eebb3205af11323473Owen Taylor (Result) = -1; \ 970891c64816faaadc8e26f9eebb3205af11323473Owen Taylor break; \ 980891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 990891c64816faaadc8e26f9eebb3205af11323473Owen Taylor (Result) <<= 6; \ 1000891c64816faaadc8e26f9eebb3205af11323473Owen Taylor (Result) |= ((Chars)[(Count)] & 0x3f); \ 1010891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 102956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 103956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor#define UNICODE_VALID(Char) \ 104956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x110000 && \ 105cb0e4de11c8132878acb9df9d8923eab73552363Matthias Clasen (((Char) & 0xFFFFF800) != 0xD800) && \ 106cb0e4de11c8132878acb9df9d8923eab73552363Matthias Clasen ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \ 107bc76e98174d16e0575b439508c4378a62f2ba785Noah Levitt ((Char) & 0xFFFE) != 0xFFFE) 108956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 109956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 11049fb6c6cc28dad0049b8e5a94c2cd2762f75baabOwen Taylorstatic const gchar utf8_skip_data[256] = { 1110891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1120891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1130891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1140891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1150891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1160891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1170891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 118f344cb2cc2ba645ebc27c007c44e6cf871e6beccDaniel Elstner 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 1190891c64816faaadc8e26f9eebb3205af11323473Owen Taylor}; 1200891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 12149fb6c6cc28dad0049b8e5a94c2cd2762f75baabOwen Taylorconst gchar * const g_utf8_skip = utf8_skip_data; 12249fb6c6cc28dad0049b8e5a94c2cd2762f75baabOwen Taylor 1230891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 1240891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_find_prev_char: 12550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @str: pointer to the beginning of a UTF-8 encoded string 1260891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @p: pointer to some position within @str 1270891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1280891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Given a position @p with a UTF-8 encoded string @str, find the start 1290891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * of the previous UTF-8 character starting before @p. Returns %NULL if no 1303afc87dd6290d11b2d9096b61fb943d2fd12d45bKang Jeong-Hee * UTF-8 characters are present in @str before @p. 1310891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 13250d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p does not have to be at the beginning of a UTF-8 character. No check 1330891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * is made to see if the character found is actually valid other than 1340891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * it starts with an appropriate byte. 1350891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1360891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: a pointer to the found character or %NULL. 1370891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 1380891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 1390891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_find_prev_char (const char *str, 1400891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const char *p) 1410891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 142149a0fb2bdd8eeae8790f9fd758cc59ae59bcbb5Christopher James Lahey for (--p; p >= str; --p) 1430891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 1440891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if ((*p & 0xc0) != 0x80) 1450891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (gchar *)p; 1460891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 1470891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return NULL; 1480891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 1490891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1500891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 1510891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_find_next_char: 1520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @p: a pointer to a position within a UTF-8 encoded string 1538b4e4a13d81f2b11a88f5901031839b80233f213Behdad Esfahbod * @end: a pointer to the byte following the end of the string, 1548b4e4a13d81f2b11a88f5901031839b80233f213Behdad Esfahbod * or %NULL to indicate that the string is nul-terminated. 1550891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 15650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Finds the start of the next UTF-8 character in the string after @p. 1570891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 15850d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p does not have to be at the beginning of a UTF-8 character. No check 1590891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * is made to see if the character found is actually valid other than 1600891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * it starts with an appropriate byte. 1610891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1620891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: a pointer to the found character or %NULL 1630891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 1640891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 1650891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_find_next_char (const gchar *p, 1660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *end) 1670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 1680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (*p) 1690891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 1700891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (end) 1710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor for (++p; p < end && (*p & 0xc0) == 0x80; ++p) 1720891c64816faaadc8e26f9eebb3205af11323473Owen Taylor ; 1730891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else 1740891c64816faaadc8e26f9eebb3205af11323473Owen Taylor for (++p; (*p & 0xc0) == 0x80; ++p) 1750891c64816faaadc8e26f9eebb3205af11323473Owen Taylor ; 1760891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 1770891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (p == end) ? NULL : (gchar *)p; 1780891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 1790891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1800891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 1810891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_prev_char: 1820891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @p: a pointer to a position within a UTF-8 encoded string 1830891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 18450d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Finds the previous UTF-8 character in the string before @p. 1850891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1860891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @p does not have to be at the beginning of a UTF-8 character. No check 1870891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * is made to see if the character found is actually valid other than 1880891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * it starts with an appropriate byte. If @p might be the first 18950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * character of the string, you must use g_utf8_find_prev_char() instead. 1900891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1910891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: a pointer to the found character. 1920891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 1930891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 1940891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_prev_char (const gchar *p) 1950891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 1960891c64816faaadc8e26f9eebb3205af11323473Owen Taylor while (TRUE) 1970891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 1980891c64816faaadc8e26f9eebb3205af11323473Owen Taylor p--; 1990891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if ((*p & 0xc0) != 0x80) 2000891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (gchar *)p; 2010891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 2020891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 2030891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2040891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 2050891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_strlen: 20650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: pointer to the start of a UTF-8 encoded string. 2070891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @max: the maximum number of bytes to examine. If @max 2080891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * is less than 0, then the string is assumed to be 209048002c8f74d8779f24085edcb790ee71cc93329Matthias Clasen * nul-terminated. If @max is 0, @p will not be examined and 210048002c8f74d8779f24085edcb790ee71cc93329Matthias Clasen * may be %NULL. 2110891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 21250d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Returns the length of the string in characters. 21350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * 2140891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: the length of the string in characters 2154eab875811c415d894626b51818a447adfa1af71Havoc Pennington **/ 216f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylorglong 21794b8df8ca039527044a66ea2cd3ab6ebf07b54b8Havoc Penningtong_utf8_strlen (const gchar *p, 218f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor gssize max) 2190891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 220f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len = 0; 2210891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *start = p; 222048002c8f74d8779f24085edcb790ee71cc93329Matthias Clasen g_return_val_if_fail (p != NULL || max == 0, 0); 223767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 224767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington if (max < 0) 225767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington { 226767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington while (*p) 227767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington { 228767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington p = g_utf8_next_char (p); 229767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington ++len; 230767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington } 231767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington } 232767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington else 2330891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 234767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington if (max == 0 || !*p) 235767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington return 0; 236767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 237767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington p = g_utf8_next_char (p); 238767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 239767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington while (p - start < max && *p) 240767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington { 241767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington ++len; 242767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington p = g_utf8_next_char (p); 243767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington } 244767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 245767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington /* only do the last len increment if we got a complete 246767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington * char (don't count partial chars) 247767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington */ 2488b3b3adb6d8158b296c66d9166fcbe4b0cc8e66cMatthias Clasen if (p - start <= max) 249767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington ++len; 2500891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 251767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 2520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return len; 2530891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 2540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2550891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 2560891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_get_char: 25750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: a pointer to Unicode character encoded as UTF-8 2580891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 25950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. 260ebec3d7ce7c2ca56ab1167403669a74bfe12b110Havoc Pennington * If @p does not point to a valid UTF-8 encoded character, results are 261f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * undefined. If you are not sure that the bytes are complete 26250d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * valid Unicode characters, you should use g_utf8_get_char_validated() 263f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * instead. 2640891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 265ebec3d7ce7c2ca56ab1167403669a74bfe12b110Havoc Pennington * Return value: the resulting character 2660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 2670891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgunichar 2680891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_get_char (const gchar *p) 2690891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 2700891c64816faaadc8e26f9eebb3205af11323473Owen Taylor int i, mask = 0, len; 2710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gunichar result; 2720891c64816faaadc8e26f9eebb3205af11323473Owen Taylor unsigned char c = (unsigned char) *p; 2730891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2740891c64816faaadc8e26f9eebb3205af11323473Owen Taylor UTF8_COMPUTE (c, mask, len); 2750891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (len == -1) 2760891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (gunichar)-1; 2770891c64816faaadc8e26f9eebb3205af11323473Owen Taylor UTF8_GET (result, p, i, mask, len); 2780891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2790891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return result; 2800891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 2810891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2820891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 2830891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_offset_to_pointer: 2840891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @str: a UTF-8 encoded string 28550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @offset: a character offset within @str 286cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * 2870891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Converts from an integer character offset to a pointer to a position 2880891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * within the string. 289cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * 2901ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen * Since 2.10, this function allows to pass a negative @offset to 291f59aac306d4d1c9f334943aacacdffe404cf9759Matthias Clasen * step backwards. It is usually worth stepping backwards from the end 292cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * instead of forwards if @offset is in the last fourth of the string, 293f59aac306d4d1c9f334943aacacdffe404cf9759Matthias Clasen * since moving forward is about 3 times faster than moving backward. 294cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * 295cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * <note><para> 296cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * This function doesn't abort when reaching the end of @str. Therefore 297cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * you should be sure that @offset is within string boundaries before 298cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * calling that function. Call g_utf8_strlen() when unsure. 299cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * 300cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * This limitation exists as this function is called frequently during 301cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * text rendering and therefore has to be as fast as possible. 302cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * </para></note> 303cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann * 3040891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: the resulting pointer 3050891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 3060891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 3070891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_offset_to_pointer (const gchar *str, 308cf01757ba0bb6f3564dc6a5f9cb725e3beab8229Mathias Hasselmann glong offset) 3090891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 3100891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *s = str; 3111ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen 3121ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen if (offset > 0) 3131ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen while (offset--) 3141ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen s = g_utf8_next_char (s); 3151ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen else 3161ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen { 3171ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen const char *s1; 3181ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen 3191ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen /* This nice technique for fast backwards stepping 3201ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen * through a UTF-8 string was dubbed "stutter stepping" 3211ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen * by its inventor, Larry Ewing. 3221ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen */ 3231ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen while (offset) 3241ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen { 3251ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen s1 = s; 3261ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen s += offset; 3271ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen while ((*s & 0xc0) == 0x80) 3281ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen s--; 3291ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen 3301ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen offset += g_utf8_pointer_to_offset (s, s1); 3311ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen } 3321ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen } 3331ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen 3340891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (gchar *)s; 3350891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 3360891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 3370891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 3380891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_pointer_to_offset: 3390891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @str: a UTF-8 encoded string 3400891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @pos: a pointer to a position within @str 3410891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 3420891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Converts from a pointer to position within a string to a integer 34350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * character offset. 3441ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen * 3451ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen * Since 2.10, this function allows @pos to be before @str, and returns 3461ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen * a negative offset in this case. 3470891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 3480891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: the resulting character offset 3490891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 350f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylorglong 3510891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_pointer_to_offset (const gchar *str, 3520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *pos) 3530891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 3540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *s = str; 355f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong offset = 0; 3560891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 3571ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen if (pos < str) 3581ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen offset = - g_utf8_pointer_to_offset (pos, str); 3591ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen else 3601ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen while (s < pos) 3611ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen { 3621ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen s = g_utf8_next_char (s); 3631ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen offset++; 3641ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen } 3651ee0917984152f9fe09b33a3660ba96cec0b55b1Matthias Clasen 3660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return offset; 3670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 3680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 3690891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 3704eab875811c415d894626b51818a447adfa1af71Havoc Pennington/** 3714eab875811c415d894626b51818a447adfa1af71Havoc Pennington * g_utf8_strncpy: 3724eab875811c415d894626b51818a447adfa1af71Havoc Pennington * @dest: buffer to fill with characters from @src 37350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @src: UTF-8 encoded string 3744eab875811c415d894626b51818a447adfa1af71Havoc Pennington * @n: character count 3754eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 376a412fb16541620ed72da86daac0774afe4703d9dMatthias Clasen * Like the standard C strncpy() function, but 37750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * copies a given number of characters instead of a given number of 37850d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * bytes. The @src string must be valid UTF-8 encoded text. 37950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * (Use g_utf8_validate() on all text before trying to use UTF-8 38050d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * utility functions with it.) 3814eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 3824eab875811c415d894626b51818a447adfa1af71Havoc Pennington * Return value: @dest 3834eab875811c415d894626b51818a447adfa1af71Havoc Pennington **/ 3840891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 3856d7ee813037c8b1cda721f6b37297a5c89ff18c0Owen Taylorg_utf8_strncpy (gchar *dest, 3866d7ee813037c8b1cda721f6b37297a5c89ff18c0Owen Taylor const gchar *src, 3876d7ee813037c8b1cda721f6b37297a5c89ff18c0Owen Taylor gsize n) 3880891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 3890891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *s = src; 3900891c64816faaadc8e26f9eebb3205af11323473Owen Taylor while (n && *s) 3910891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 3920891c64816faaadc8e26f9eebb3205af11323473Owen Taylor s = g_utf8_next_char(s); 3930891c64816faaadc8e26f9eebb3205af11323473Owen Taylor n--; 3940891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 3950891c64816faaadc8e26f9eebb3205af11323473Owen Taylor strncpy(dest, src, s - src); 3960891c64816faaadc8e26f9eebb3205af11323473Owen Taylor dest[s - src] = 0; 3970891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return dest; 3980891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 3990891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 400b5fa5b9867eec91047a16d45f79888395cf89931Owen TaylorG_LOCK_DEFINE_STATIC (aliases); 4010891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 402b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylorstatic GHashTable * 403b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylorget_alias_hash (void) 404b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor{ 405b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor static GHashTable *alias_hash = NULL; 406b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char *aliases; 4070891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 408b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor G_LOCK (aliases); 4090891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 410b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (!alias_hash) 4110891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 412b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_hash = g_hash_table_new (g_str_hash, g_str_equal); 413b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 414b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor aliases = _g_locale_get_charset_aliases (); 415b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor while (*aliases != '\0') 416b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor { 417b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char *canonical; 418b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char *alias; 419b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char **alias_array; 420b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor int count = 0; 421b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 422b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias = aliases; 423b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor aliases += strlen (aliases) + 1; 424b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor canonical = aliases; 425b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor aliases += strlen (aliases) + 1; 426b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 427b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_array = g_hash_table_lookup (alias_hash, canonical); 428b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (alias_array) 429b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor { 430b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor while (alias_array[count]) 431b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor count++; 432b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor } 433b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 434b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_array = g_renew (const char *, alias_array, count + 2); 435b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_array[count] = alias; 436b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_array[count + 1] = NULL; 437b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 438b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor g_hash_table_insert (alias_hash, (char *)canonical, alias_array); 439b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor } 4400891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 441b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 442b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor G_UNLOCK (aliases); 443b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 444b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return alias_hash; 445b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor} 446b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 447b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor/* As an abuse of the alias table, the following routines gets 448b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor * the charsets that are aliases for the canonical name. 449b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor */ 4500c01eb3a7313a580fe34044bcdc3337d067c8a15Jaikumar Ganesh#ifndef ANDROID_STUB 451df3dd538e555ca1caeba4f507aea707916895a98Matthias ClasenG_GNUC_INTERNAL const char ** 452b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor_g_charset_get_aliases (const char *canonical_name) 453b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor{ 454b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor GHashTable *alias_hash = get_alias_hash (); 455b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 456b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return g_hash_table_lookup (alias_hash, canonical_name); 457b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor} 458bb28d819383d1cbebb355153d2f53c858288835fJaikumar Ganesh#endif 459b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 460b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylorstatic gboolean 461ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylorg_utf8_get_charset_internal (const char *raw_data, 462ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor const char **a) 463b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor{ 464b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char *charset = getenv("CHARSET"); 465b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 466b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (charset && *charset) 4670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 468b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor *a = charset; 469b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 470b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (charset && strstr (charset, "UTF-8")) 4710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return TRUE; 472b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor else 473b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return FALSE; 4740891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 4750891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 476b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor /* The libcharset code tries to be thread-safe without 477b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor * a lock, but has a memory leak and a missing memory 478b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor * barrier, so we lock for it 479b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor */ 4800c01eb3a7313a580fe34044bcdc3337d067c8a15Jaikumar Ganesh#ifndef ANDROID_STUB 481b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor G_LOCK (aliases); 482ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor charset = _g_locale_charset_unalias (raw_data); 483b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor G_UNLOCK (aliases); 484b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 485b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (charset && *charset) 4864f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist { 487b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor *a = charset; 4884f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist 489b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (charset && strstr (charset, "UTF-8")) 490b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return TRUE; 491b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor else 492b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return FALSE; 4934f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist } 4940c01eb3a7313a580fe34044bcdc3337d067c8a15Jaikumar Ganesh#endif 4954f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist 4960891c64816faaadc8e26f9eebb3205af11323473Owen Taylor /* Assume this for compatibility at present. */ 497b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor *a = "US-ASCII"; 498b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 4990891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return FALSE; 5000891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 5010891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 502ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylortypedef struct _GCharsetCache GCharsetCache; 503ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 504ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylorstruct _GCharsetCache { 505ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor gboolean is_utf8; 506ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor gchar *raw; 507ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor gchar *charset; 508ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor}; 509ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 510ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylorstatic void 511ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylorcharset_cache_free (gpointer data) 512ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor{ 513ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor GCharsetCache *cache = data; 514ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache->raw); 515ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache->charset); 516ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache); 517ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor} 5180891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 5194eab875811c415d894626b51818a447adfa1af71Havoc Pennington/** 5204eab875811c415d894626b51818a447adfa1af71Havoc Pennington * g_get_charset: 5214eab875811c415d894626b51818a447adfa1af71Havoc Pennington * @charset: return location for character set name 5224eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 5233ab1139aeff0f7d05f727d2a163a96a1c8cbb1f3Matthias Clasen * Obtains the character set for the <link linkend="setlocale">current 5243ab1139aeff0f7d05f727d2a163a96a1c8cbb1f3Matthias Clasen * locale</link>; you might use this character set as an argument to 5253ab1139aeff0f7d05f727d2a163a96a1c8cbb1f3Matthias Clasen * g_convert(), to convert from the current locale's encoding to some 5263ab1139aeff0f7d05f727d2a163a96a1c8cbb1f3Matthias Clasen * other encoding. (Frequently g_locale_to_utf8() and g_locale_from_utf8() 5273ab1139aeff0f7d05f727d2a163a96a1c8cbb1f3Matthias Clasen * are nice shortcuts, though.) 5284eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 529b60f5cf11bb91165d248dfdbf6546dcf4c353f8eTor Lillqvist * On Windows the character set returned by this function is the 530b60f5cf11bb91165d248dfdbf6546dcf4c353f8eTor Lillqvist * so-called system default ANSI code-page. That is the character set 531b60f5cf11bb91165d248dfdbf6546dcf4c353f8eTor Lillqvist * used by the "narrow" versions of C library and Win32 functions that 532b60f5cf11bb91165d248dfdbf6546dcf4c353f8eTor Lillqvist * handle file names. It might be different from the character set 533b60f5cf11bb91165d248dfdbf6546dcf4c353f8eTor Lillqvist * used by the C library's current locale. 534b60f5cf11bb91165d248dfdbf6546dcf4c353f8eTor Lillqvist * 5354eab875811c415d894626b51818a447adfa1af71Havoc Pennington * The return value is %TRUE if the locale's encoding is UTF-8, in that 5364eab875811c415d894626b51818a447adfa1af71Havoc Pennington * case you can perhaps avoid calling g_convert(). 5374eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 5384eab875811c415d894626b51818a447adfa1af71Havoc Pennington * The string returned in @charset is not allocated, and should not be 5394eab875811c415d894626b51818a447adfa1af71Havoc Pennington * freed. 5404eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 5414eab875811c415d894626b51818a447adfa1af71Havoc Pennington * Return value: %TRUE if the returned charset is UTF-8 5424eab875811c415d894626b51818a447adfa1af71Havoc Pennington **/ 5430891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgboolean 544f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylorg_get_charset (G_CONST_RETURN char **charset) 5450891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 546ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT; 547ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor GCharsetCache *cache = g_static_private_get (&cache_private); 548ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor const gchar *raw; 549ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 550ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor if (!cache) 5510891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 552ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor cache = g_new0 (GCharsetCache, 1); 553ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_static_private_set (&cache_private, cache, charset_cache_free); 5540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 555ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 5560c01eb3a7313a580fe34044bcdc3337d067c8a15Jaikumar Ganesh#ifndef ANDROID_STUB 557ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor raw = _g_locale_charset_raw (); 558ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 559ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor if (!(cache->raw && strcmp (cache->raw, raw) == 0)) 560ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor { 561ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor const gchar *new_charset; 562ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 563ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache->raw); 564ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache->charset); 565ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor cache->raw = g_strdup (raw); 566ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset); 567ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor cache->charset = g_strdup (new_charset); 568ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor } 5690c01eb3a7313a580fe34044bcdc3337d067c8a15Jaikumar Ganesh#else 5700c01eb3a7313a580fe34044bcdc3337d067c8a15Jaikumar Ganesh cache->charset = g_strdup("UTF-8"); 5710c01eb3a7313a580fe34044bcdc3337d067c8a15Jaikumar Ganesh cache->is_utf8 = TRUE; 572bb28d819383d1cbebb355153d2f53c858288835fJaikumar Ganesh#endif 573ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor if (charset) 574ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor *charset = cache->charset; 575ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 576ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor return cache->is_utf8; 5770891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 5780891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 5790891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/* unicode_strchr */ 5800891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 5810891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 5820891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_unichar_to_utf8: 5838bd94d0eb85901f6429afa00c98d28b21baa85d9Behdad Esfahbod * @c: a Unicode character code 584da765af2bcbcf1718a49cb9634766e4884493f5dOwen Taylor * @outbuf: output buffer, must have at least 6 bytes of space. 58537e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor * If %NULL, the length will be computed and returned 58650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * and nothing will be written to @outbuf. 5870891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 58850d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Converts a single character to UTF-8. 5890891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 5900891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: number of bytes written 5910891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 5920891c64816faaadc8e26f9eebb3205af11323473Owen Taylorint 593f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylorg_unichar_to_utf8 (gunichar c, 594f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor gchar *outbuf) 5950891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 596efa05f88efe52b32aff01d3694b6ebe3cd9cfe6aRoss Burton /* If this gets modified, also update the copy in g_string_insert_unichar() */ 597f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor guint len = 0; 5980891c64816faaadc8e26f9eebb3205af11323473Owen Taylor int first; 5990891c64816faaadc8e26f9eebb3205af11323473Owen Taylor int i; 6000891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 6010891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (c < 0x80) 6020891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 6030891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0; 6040891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 1; 6050891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 6060891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if (c < 0x800) 6070891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 6080891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xc0; 6090891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 2; 6100891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 6110891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if (c < 0x10000) 6120891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 6130891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xe0; 6140891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 3; 6150891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 6160891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if (c < 0x200000) 6170891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 6180891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xf0; 6190891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 4; 6200891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 6210891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if (c < 0x4000000) 6220891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 6230891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xf8; 6240891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 5; 6250891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 6260891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else 6270891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 6280891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xfc; 6290891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 6; 6300891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 6310891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 63237e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor if (outbuf) 6330891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 63437e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor for (i = len - 1; i > 0; --i) 63537e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor { 63637e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor outbuf[i] = (c & 0x3f) | 0x80; 63737e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor c >>= 6; 63837e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor } 63937e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor outbuf[0] = c | first; 6400891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 6410891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 6420891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return len; 6430891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 6440891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 6450891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 6460891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_strchr: 64750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: a nul-terminated UTF-8 encoded string 64850d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @p 6498bd94d0eb85901f6429afa00c98d28b21baa85d9Behdad Esfahbod * @c: a Unicode character 6500891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 6518bd94d0eb85901f6429afa00c98d28b21baa85d9Behdad Esfahbod * Finds the leftmost occurrence of the given Unicode character 65250d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * in a UTF-8 encoded string, while limiting the search to @len bytes. 65350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * If @len is -1, allow unbounded search. 6540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 65550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Return value: %NULL if the string does not contain the character, 65650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * otherwise, a pointer to the start of the leftmost occurrence of 65750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * the character in the string. 6580891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 6590891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 660106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larssong_utf8_strchr (const char *p, 66116fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor gssize len, 662106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larsson gunichar c) 6630891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 6640891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gchar ch[10]; 6650891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 66616fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor gint charlen = g_unichar_to_utf8 (c, ch); 66716fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor ch[charlen] = '\0'; 6680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 66916fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor return g_strstr_len (p, len, ch); 6700891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 6710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 672106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larsson 6730891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 6740891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_strrchr: 67550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: a nul-terminated UTF-8 encoded string 67650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @p 6778bd94d0eb85901f6429afa00c98d28b21baa85d9Behdad Esfahbod * @c: a Unicode character 6780891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 6798bd94d0eb85901f6429afa00c98d28b21baa85d9Behdad Esfahbod * Find the rightmost occurrence of the given Unicode character 68050d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * in a UTF-8 encoded string, while limiting the search to @len bytes. 68150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * If @len is -1, allow unbounded search. 6820891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 68350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Return value: %NULL if the string does not contain the character, 68450d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * otherwise, a pointer to the start of the rightmost occurrence of the 68550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * character in the string. 6860891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 6870891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 688106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larssong_utf8_strrchr (const char *p, 68916fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor gssize len, 690106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larsson gunichar c) 6910891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 6920891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gchar ch[10]; 6930891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 69416fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor gint charlen = g_unichar_to_utf8 (c, ch); 69516fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor ch[charlen] = '\0'; 6960891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 69716fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor return g_strrstr_len (p, len, ch); 6980891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 6990891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 7000891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 701956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/* Like g_utf8_get_char, but take a maximum length 702956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * and return (gunichar)-2 on incomplete trailing character 703956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor */ 704956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorstatic inline gunichar 705f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylorg_utf8_get_char_extended (const gchar *p, 706f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor gssize max_len) 707956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 708f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor guint i, len; 709956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = (guchar) *p; 710956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 711956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0x80) 712956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 713956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return wc; 714956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 715956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xc0) 716956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 717956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 718956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 719956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xe0) 720956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 721956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 2; 722956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x1f; 723956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 724956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xf0) 725956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 726956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 3; 727956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x0f; 728956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 729956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xf8) 730956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 731956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 4; 732956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x07; 733956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 734956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xfc) 735956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 736956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 5; 737956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x03; 738956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 739956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xfe) 740956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 741956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 6; 742956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x01; 743956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 744956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 745956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 746956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 747956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 748956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 749956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (max_len >= 0 && len > max_len) 750956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 751956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 1; i < max_len; i++) 752956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 753956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if ((((guchar *)p)[i] & 0xc0) != 0x80) 754956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 755956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 756956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-2; 757956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 758956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 759956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 1; i < len; ++i) 760956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 761956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar ch = ((guchar *)p)[i]; 762956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 763956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if ((ch & 0xc0) != 0x80) 764956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 765956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (ch) 766956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 767956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 768956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-2; 769956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 770956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 771956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc <<= 6; 772956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc |= (ch & 0x3f); 773956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 774956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 775956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (UTF8_LENGTH(wc) != len) 776956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 777956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 778956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return wc; 779956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 780956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 7810891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 782f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * g_utf8_get_char_validated: 78350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: a pointer to Unicode character encoded as UTF-8 784ed6b059bc28cbaa8cfdcf2698f8783e75077abf8Behdad Esfahbod * @max_len: the maximum number of bytes to read, or -1, for no maximum or 785ed6b059bc28cbaa8cfdcf2698f8783e75077abf8Behdad Esfahbod * if @p is nul-terminated 786f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * 78750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Convert a sequence of bytes encoded as UTF-8 to a Unicode character. 788f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * This function checks for incomplete characters, for invalid characters 789f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * such as characters that are out of the range of Unicode, and for 790f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * overlong encodings of valid characters. 791f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * 792f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * Return value: the resulting character. If @p points to a partial 793b205c9267b85243a1081b0854d172bf25284fbabMatthias Clasen * sequence at the end of a string that could begin a valid 794ffabd954ee2164b819a892c4a445291097de1a85Matthias Clasen * character (or if @max_len is zero), returns (gunichar)-2; 795ffabd954ee2164b819a892c4a445291097de1a85Matthias Clasen * otherwise, if @p does not point to a valid UTF-8 encoded 796ffabd954ee2164b819a892c4a445291097de1a85Matthias Clasen * Unicode character, returns (gunichar)-1. 797f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor **/ 798f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylorgunichar 799f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylorg_utf8_get_char_validated (const gchar *p, 800f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor gssize max_len) 801f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor{ 802ffabd954ee2164b819a892c4a445291097de1a85Matthias Clasen gunichar result; 803ffabd954ee2164b819a892c4a445291097de1a85Matthias Clasen 804ffabd954ee2164b819a892c4a445291097de1a85Matthias Clasen if (max_len == 0) 805ffabd954ee2164b819a892c4a445291097de1a85Matthias Clasen return (gunichar)-2; 806ffabd954ee2164b819a892c4a445291097de1a85Matthias Clasen 807ffabd954ee2164b819a892c4a445291097de1a85Matthias Clasen result = g_utf8_get_char_extended (p, max_len); 808f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor 809f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor if (result & 0x80000000) 810f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor return result; 811f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor else if (!UNICODE_VALID (result)) 812f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor return (gunichar)-1; 813f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor else 814f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor return result; 815f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor} 816f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor 817f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor/** 818956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf8_to_ucs4_fast: 819956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-8 encoded string 8208b4e4a13d81f2b11a88f5901031839b80233f213Behdad Esfahbod * @len: the maximum length of @str to use, in bytes. If @len < 0, 8218b4e4a13d81f2b11a88f5901031839b80233f213Behdad Esfahbod * then the string is nul-terminated. 822956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store the number of characters in the 823956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * result, or %NULL. 824956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 8250891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Convert a string from UTF-8 to a 32-bit fixed width 826956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * representation as UCS-4, assuming valid UTF-8 input. 827956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This function is roughly twice as fast as g_utf8_to_ucs4() 828956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * but does no error checking on the input. 8290891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 8300891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: a pointer to a newly allocated UCS-4 string. 83150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * This value must be freed with g_free(). 8320891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 8330891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgunichar * 834956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf8_to_ucs4_fast (const gchar *str, 835f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 836f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written) 8370891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 838956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint j, charlen; 8390891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gunichar *result; 8400891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gint n_chars, i; 8410891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *p; 842956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 843956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_return_val_if_fail (str != NULL, NULL); 844956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 845956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p = str; 846956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_chars = 0; 847956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (len < 0) 848956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 849956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while (*p) 850956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 851956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p = g_utf8_next_char (p); 852956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ++n_chars; 853956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 854956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 855956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 856956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 857d4e0ae748ac3b123e0aff97f965bfffbe4046a3bOwen Taylor while (p < str + len && *p) 858956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 859956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p = g_utf8_next_char (p); 860956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ++n_chars; 861956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 862956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 8630891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 864956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_new (gunichar, n_chars + 1); 8650891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 8660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor p = str; 8670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor for (i=0; i < n_chars; i++) 8680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 869956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = ((unsigned char *)p)[0]; 870956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 871956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0x80) 872956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 873956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = wc; 874956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p++; 875956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 876956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 877956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 878956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0xe0) 879956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 880956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 2; 881956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x1f; 882956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 883956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xf0) 884956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 885956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 3; 886956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x0f; 887956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 888956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xf8) 889956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 890956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 4; 891956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x07; 892956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 893956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xfc) 894956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 895956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 5; 896956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x03; 897956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 898956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 899956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 900956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 6; 901956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x01; 902956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 903956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 904956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (j = 1; j < charlen; j++) 905956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 906956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc <<= 6; 907956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc |= ((unsigned char *)p)[j] & 0x3f; 908956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 909956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 910956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = wc; 911956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p += charlen; 912956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 9130891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 914956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = 0; 915956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 916956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 917956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = i; 918956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 919956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return result; 920956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 921956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 922956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 923956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf8_to_ucs4: 924956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-8 encoded string 9258b4e4a13d81f2b11a88f5901031839b80233f213Behdad Esfahbod * @len: the maximum length of @str to use, in bytes. If @len < 0, 9268b4e4a13d81f2b11a88f5901031839b80233f213Behdad Esfahbod * then the string is nul-terminated. 927956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of bytes read, or %NULL. 928956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 929956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * returned in case @str contains a trailing partial 930956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * character. If an error occurs then the index of the 931956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * invalid input is stored here. 932956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of characters written or %NULL. 933956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * The value here stored does not include the trailing 0 934956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * character. 935956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 936956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 937956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 938956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 939956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Convert a string from UTF-8 to a 32-bit fixed width 940956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * representation as UCS-4. A trailing 0 will be added to the 941956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * string after the converted text. 942956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 943956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Return value: a pointer to a newly allocated UCS-4 string. 944956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 945956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 946956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 947956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 948956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgunichar * 949956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf8_to_ucs4 (const gchar *str, 950f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 951f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 952f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 953956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 954956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 955956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar *result = NULL; 956956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n_chars, i; 957956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor const gchar *in; 958956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 959956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 960956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_chars = 0; 961956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || str + len - in > 0) && *in) 962956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 9633a042a8959501f9e90df41fc31e3167dd7aa6222Matthias Clasen gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in); 964956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc & 0x80000000) 965956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 966956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc == (gunichar)-2) 967956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 968956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 969956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor break; 970956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 9719c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 9729c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Partial character sequence at end of input")); 973956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 974956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 9759c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 9769c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Invalid byte sequence in conversion input")); 977956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 978956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 979956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 980956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 981956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_chars++; 982956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 983956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = g_utf8_next_char (in); 984956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 985956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 986956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_new (gunichar, n_chars + 1); 987956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 988956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 989956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i=0; i < n_chars; i++) 990956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 991956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = g_utf8_get_char (in); 992956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = g_utf8_next_char (in); 993956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 994956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = 0; 995956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 996956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 997956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = n_chars; 998956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 999956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1000956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1001956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = in - str; 10020891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 10030891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return result; 10040891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 10050891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 100649c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington/** 1007ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * g_ucs4_to_utf8: 1008ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * @str: a UCS-4 encoded string 1009c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * @len: the maximum length (number of characters) of @str to use. 1010ae6300bd091efc1a105027caa711590a05d524fcBehdad Esfahbod * If @len < 0, then the string is nul-terminated. 10113a042a8959501f9e90df41fc31e3167dd7aa6222Matthias Clasen * @items_read: location to store number of characters read, or %NULL. 1012956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of bytes written or %NULL. 1013956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * The value here stored does not include the trailing 0 1014956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * byte. 1015956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 1016956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 1017956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1018956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1019ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * Convert a string from a 32-bit fixed width representation as UCS-4. 1020956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * to UTF-8. The result will be terminated with a 0 byte. 1021ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * 1022ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * Return value: a pointer to a newly allocated UTF-8 string. 1023956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 1024956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 102579fa3efec28252251f38cf03aae5097cbe004c7fMatthias Clasen * @error set. In that case, @items_read will be 102679fa3efec28252251f38cf03aae5097cbe004c7fMatthias Clasen * set to the position of the first invalid input 102779fa3efec28252251f38cf03aae5097cbe004c7fMatthias Clasen * character. 1028ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor **/ 1029ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylorgchar * 1030956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_ucs4_to_utf8 (const gunichar *str, 1031f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 1032f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 1033f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 1034956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 1035ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor{ 1036ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor gint result_length; 1037956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *result = NULL; 1038956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *p; 1039ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor gint i; 1040ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 1041ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor result_length = 0; 1042956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 0; len < 0 || i < len ; i++) 1043956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1044956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (!str[i]) 1045956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor break; 1046ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 1047956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (str[i] >= 0x80000000) 1048956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 10499c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 10509c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Character out of range for UTF-8")); 1051956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1052956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1053956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1054956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result_length += UTF8_LENGTH (str[i]); 1055956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1056ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 1057ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor result = g_malloc (result_length + 1); 1058ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor p = result; 1059ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 1060956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor i = 0; 1061956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while (p < result + result_length) 1062956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p += g_unichar_to_utf8 (str[i++], p); 1063ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 1064ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor *p = '\0'; 1065ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 1066956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1067956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = p - result; 1068956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1069956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1070956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1071956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = i; 1072956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1073956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return result; 1074956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 1075956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1076956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000) 1077956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1078956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 1079956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf16_to_utf8: 1080956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-16 encoded string 1081c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * @len: the maximum length (number of <type>gunichar2</type>) of @str to use. 1082ae6300bd091efc1a105027caa711590a05d524fcBehdad Esfahbod * If @len < 0, then the string is nul-terminated. 1083956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of words read, or %NULL. 1084956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 1085956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * returned in case @str contains a trailing partial 1086956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * character. If an error occurs then the index of the 1087956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * invalid input is stored here. 1088956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of bytes written, or %NULL. 1089956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * The value stored here does not include the trailing 1090956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 0 byte. 1091956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 1092956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 1093956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1094956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1095956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Convert a string from UTF-16 to UTF-8. The result will be 1096956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * terminated with a 0 byte. 1097a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * 1098a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * Note that the input is expected to be already in native endianness, 1099a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * an initial byte-order-mark character is not handled specially. 1100a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * g_convert() can be used to convert a byte buffer of UTF-16 data of 1101a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * ambiguous endianess. 1102956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1103956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Return value: a pointer to a newly allocated UTF-8 string. 1104956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 1105956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 1106956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 1107956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 1108956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgchar * 1109956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf16_to_utf8 (const gunichar2 *str, 1110f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 1111f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 1112f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 1113956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 1114956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 1115956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ 1116956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * are marked. 1117956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor */ 1118956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor const gunichar2 *in; 1119956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *out; 1120956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *result = NULL; 1121956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n_bytes; 1122956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar high_surrogate; 1123956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 11249df1f4fcc73807dc0ff4bad96caaba36b97c5a15Behdad Esfahbod g_return_val_if_fail (str != NULL, NULL); 1125956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1126956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_bytes = 0; 1127956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1128956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1129956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || in - str < len) && *in) 1130956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1131956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 c = *in; 1132956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc; 1133956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1134956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1135956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1136956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate) 1137956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1138956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = SURROGATE_VALUE (high_surrogate, c); 1139956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1140956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1141956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1142956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 11439c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 11449c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Invalid sequence in conversion input")); 1145956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1146956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1147956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1148956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1149956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1150956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate) 1151956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 11529c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 11539c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Invalid sequence in conversion input")); 1154956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1155956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1156956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1157956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1158956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1159956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = c; 1160956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto next1; 1161956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1162956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1163956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = c; 1164956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1165956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1166956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1167956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_bytes += UTF8_LENGTH (wc); 1168956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1169956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor next1: 1170956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in++; 1171956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1172956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1173956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate && !items_read) 1174956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 11759c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 11769c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Partial character sequence at end of input")); 1177956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1178956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1179956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1180956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /* At this point, everything is valid, and we just need to convert 1181956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor */ 1182956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1183956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_malloc (n_bytes + 1); 1184956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1185956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1186956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor out = result; 1187956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1188956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while (out < result + n_bytes) 1189956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1190956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 c = *in; 1191956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc; 1192956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1193956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1194956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1195956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = SURROGATE_VALUE (high_surrogate, c); 1196956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1197956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1198956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1199956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1200956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = c; 1201956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto next2; 1202956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1203956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1204956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = c; 1205956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1206956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1207956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor out += g_unichar_to_utf8 (wc, out); 1208956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1209956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor next2: 1210956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in++; 1211956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1212956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1213956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1214956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *out = '\0'; 1215956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1216956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1217956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1218956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = out - result; 1219956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1220956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1221956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1222956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = in - str; 1223956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1224956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return result; 1225956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 1226956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1227956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 1228956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf16_to_ucs4: 1229956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-16 encoded string 1230c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * @len: the maximum length (number of <type>gunichar2</type>) of @str to use. 1231ae6300bd091efc1a105027caa711590a05d524fcBehdad Esfahbod * If @len < 0, then the string is nul-terminated. 1232956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of words read, or %NULL. 1233956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 1234956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * returned in case @str contains a trailing partial 1235956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * character. If an error occurs then the index of the 1236956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * invalid input is stored here. 1237956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of characters written, or %NULL. 1238956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * The value stored here does not include the trailing 1239956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 0 character. 1240956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 1241956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 1242956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1243956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1244956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Convert a string from UTF-16 to UCS-4. The result will be 1245ae6300bd091efc1a105027caa711590a05d524fcBehdad Esfahbod * nul-terminated. 1246956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1247956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Return value: a pointer to a newly allocated UCS-4 string. 1248956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 1249956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 1250956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 1251956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 1252956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgunichar * 1253956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf16_to_ucs4 (const gunichar2 *str, 1254f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 1255f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 1256f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 1257956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 1258956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 1259956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor const gunichar2 *in; 1260956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *out; 1261956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *result = NULL; 1262956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n_bytes; 1263956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar high_surrogate; 1264956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 12659df1f4fcc73807dc0ff4bad96caaba36b97c5a15Behdad Esfahbod g_return_val_if_fail (str != NULL, NULL); 1266956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1267956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_bytes = 0; 1268956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1269956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1270956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || in - str < len) && *in) 1271956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1272956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 c = *in; 1273956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc; 1274956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1275956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1276956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1277956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate) 1278956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1279956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = SURROGATE_VALUE (high_surrogate, c); 1280956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1281956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1282956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1283956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 12849c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 12859c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Invalid sequence in conversion input")); 1286956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1287956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1288956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1289956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1290956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1291956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate) 1292956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 12939c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 12949c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Invalid sequence in conversion input")); 1295956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1296956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1297956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1298956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1299956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1300956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = c; 1301956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto next1; 1302956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1303956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1304956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = c; 1305956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1306956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1307956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1308956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_bytes += sizeof (gunichar); 1309956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1310956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor next1: 1311956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in++; 1312956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1313956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1314956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate && !items_read) 1315956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 13169c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 13179c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Partial character sequence at end of input")); 1318956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1319956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1320956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1321956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /* At this point, everything is valid, and we just need to convert 1322956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor */ 1323956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1324956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_malloc (n_bytes + 4); 1325956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1326956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1327956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor out = result; 1328956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1329956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while (out < result + n_bytes) 1330956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1331956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 c = *in; 1332956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc; 1333956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1334956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1335956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1336956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = SURROGATE_VALUE (high_surrogate, c); 1337956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1338956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1339956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1340956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1341956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = c; 1342956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto next2; 1343956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1344956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1345956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = c; 1346956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1347956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1348956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *(gunichar *)out = wc; 1349956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor out += sizeof (gunichar); 1350956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1351956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor next2: 1352956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in++; 1353956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1354956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1355956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1356956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *(gunichar *)out = 0; 1357956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1358956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1359956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1360956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = (out - result) / sizeof (gunichar); 1361956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1362956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1363956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1364956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = in - str; 1365956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1366956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar *)result; 1367956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 1368956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1369956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 1370956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf8_to_utf16: 1371956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-8 encoded string 1372c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * @len: the maximum length (number of characters) of @str to use. 1373c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * If @len < 0, then the string is nul-terminated. 1374956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of bytes read, or %NULL. 1375956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 1376956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * returned in case @str contains a trailing partial 1377956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * character. If an error occurs then the index of the 1378956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * invalid input is stored here. 1379c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * @items_written: location to store number of <type>gunichar2</type> written, 1380c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * or %NULL. 1381c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * The value stored here does not include the trailing 0. 1382956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 1383956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 1384956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1385956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1386c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * Convert a string from UTF-8 to UTF-16. A 0 character will be 1387956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * added to the result after the converted text. 1388956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1389956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Return value: a pointer to a newly allocated UTF-16 string. 1390956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 1391956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 1392956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 1393956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 1394956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgunichar2 * 1395956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf8_to_utf16 (const gchar *str, 1396f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 1397f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 1398f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 1399956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 1400956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 1401956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 *result = NULL; 1402956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n16; 1403956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor const gchar *in; 1404956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint i; 1405956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1406956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_return_val_if_fail (str != NULL, NULL); 1407956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1408956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1409956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 = 0; 1410956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || str + len - in > 0) && *in) 1411956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 14123a042a8959501f9e90df41fc31e3167dd7aa6222Matthias Clasen gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in); 1413956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc & 0x80000000) 1414956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1415956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc == (gunichar)-2) 1416956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1417956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1418956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor break; 1419956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 14209c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 14219c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Partial character sequence at end of input")); 1422956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1423956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 14249c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 14259c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Invalid byte sequence in conversion input")); 1426956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1427956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1428956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1429956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1430956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0xd800) 1431956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 1; 1432956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xe000) 1433956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 14349c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 14359c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Invalid sequence in conversion input")); 1436956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1437956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1438956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1439956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0x10000) 1440956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 1; 1441956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0x110000) 1442956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 2; 1443956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1444956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 14459c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 14469c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Character out of range for UTF-16")); 1447956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1448956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1449956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1450956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1451956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = g_utf8_next_char (in); 1452956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1453956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1454956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_new (gunichar2, n16 + 1); 1455956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1456956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1457956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 0; i < n16;) 1458956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1459956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = g_utf8_get_char (in); 1460956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1461956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0x10000) 1462956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1463956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i++] = wc; 1464956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1465956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1466956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1467956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i++] = (wc - 0x10000) / 0x400 + 0xd800; 1468956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i++] = (wc - 0x10000) % 0x400 + 0xdc00; 1469956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1470956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1471956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = g_utf8_next_char (in); 1472956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1473956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1474956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = 0; 1475956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1476956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1477956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = n16; 1478956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1479956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1480956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1481956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = in - str; 1482956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1483956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return result; 1484956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 1485956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1486956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 1487956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_ucs4_to_utf16: 1488956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UCS-4 encoded string 1489c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * @len: the maximum length (number of characters) of @str to use. 1490ae6300bd091efc1a105027caa711590a05d524fcBehdad Esfahbod * If @len < 0, then the string is nul-terminated. 1491956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of bytes read, or %NULL. 1492956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If an error occurs then the index of the invalid input 1493956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * is stored here. 1494c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * @items_written: location to store number of <type>gunichar2</type> 1495c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * written, or %NULL. The value stored here does not 1496c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * include the trailing 0. 1497956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 1498956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 1499956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1500956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1501c9387d90693bb2ba3007a44c1c1e0a8ae9242ebeMatthias Clasen * Convert a string from UCS-4 to UTF-16. A 0 character will be 1502956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * added to the result after the converted text. 1503956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1504956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Return value: a pointer to a newly allocated UTF-16 string. 1505956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 1506956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 1507956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 1508956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 1509956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgunichar2 * 1510956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_ucs4_to_utf16 (const gunichar *str, 1511f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 1512f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 1513f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 1514956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 1515956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 1516956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 *result = NULL; 1517956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n16; 1518956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint i, j; 1519956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1520956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 = 0; 1521956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor i = 0; 1522956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || i < len) && str[i]) 1523956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1524956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = str[i]; 1525956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1526956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0xd800) 1527956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 1; 1528956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xe000) 1529956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 15309c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 15319c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Invalid sequence in conversion input")); 1532956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1533956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1534956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1535956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0x10000) 1536956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 1; 1537956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0x110000) 1538956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 2; 1539956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1540956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 15419c17697b56501d11b4c653432cc9e290347aa03eChristian Persch g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 15429c17697b56501d11b4c653432cc9e290347aa03eChristian Persch _("Character out of range for UTF-16")); 1543956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1544956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1545956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1546956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1547956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor i++; 1548956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1549956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1550956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_new (gunichar2, n16 + 1); 1551956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1552956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 0, j = 0; j < n16; i++) 1553956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1554956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = str[i]; 1555956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1556956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0x10000) 1557956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1558956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[j++] = wc; 1559956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1560956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1561956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1562956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[j++] = (wc - 0x10000) / 0x400 + 0xd800; 1563956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[j++] = (wc - 0x10000) % 0x400 + 0xdc00; 1564956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1565956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1566956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[j] = 0; 1567956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1568956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1569956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = n16; 1570956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1571956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1572956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1573956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = i; 1574956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1575ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor return result; 1576ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor} 1577ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 157840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen#define CONTINUATION_CHAR \ 157940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen G_STMT_START { \ 158040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \ 158140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; \ 158240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val <<= 6; \ 158340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val |= (*(guchar *)p) & 0x3f; \ 158440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } G_STMT_END 158540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 158640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasenstatic const gchar * 158740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasenfast_validate (const char *str) 158840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 158940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen{ 159040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gunichar val = 0; 159140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gunichar min = 0; 159240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar *p; 159340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 159440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen for (p = str; *p; p++) 159540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 159640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (*(guchar *)p < 128) 159740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen /* done */; 159840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 159940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 160040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar *last; 160140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 160240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen last = p; 160340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ 160440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 160540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) 160640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 160740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 160840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ 160940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 161040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 161140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 161240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 161340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ 161440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 161540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen min = (1 << 11); 161640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val = *(guchar *)p & 0x0f; 161740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto TWO_REMAINING; 161840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 161940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ 162040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 162140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen min = (1 << 16); 162240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val = *(guchar *)p & 0x07; 162340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 162440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 162540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 162640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 162740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 162840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 162940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen TWO_REMAINING: 163040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 163140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 163240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 163340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 163440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 163540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (val < min)) 163640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 163740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 163840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (!UNICODE_VALID(val))) 163940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 164040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 164140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 164240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen continue; 164340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 164440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen error: 164540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen return last; 164640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 164740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 164840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 164940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen return p; 165040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen} 165140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 165240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasenstatic const gchar * 165340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasenfast_validate_len (const char *str, 165440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gssize max_len) 165540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 165640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen{ 165740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gunichar val = 0; 165840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gunichar min = 0; 165940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar *p; 166040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 1661e51854dd5d4bbc1bc2b6d0643e38e68513256058Paolo Borelli g_assert (max_len >= 0); 1662e51854dd5d4bbc1bc2b6d0643e38e68513256058Paolo Borelli 1663e51854dd5d4bbc1bc2b6d0643e38e68513256058Paolo Borelli for (p = str; ((p - str) < max_len) && *p; p++) 166440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 166540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (*(guchar *)p < 128) 166640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen /* done */; 166740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 166840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 166940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar *last; 167040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 167140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen last = p; 167240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ 167340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 1674e51854dd5d4bbc1bc2b6d0643e38e68513256058Paolo Borelli if (G_UNLIKELY (max_len - (p - str) < 2)) 167540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 167640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 167740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) 167840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 167940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 168040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ 168140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 168240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 168340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 168440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 168540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ 168640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 1687e51854dd5d4bbc1bc2b6d0643e38e68513256058Paolo Borelli if (G_UNLIKELY (max_len - (p - str) < 3)) 168840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 168940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 169040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen min = (1 << 11); 169140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val = *(guchar *)p & 0x0f; 169240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto TWO_REMAINING; 169340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 169440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ 169540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 1696e51854dd5d4bbc1bc2b6d0643e38e68513256058Paolo Borelli if (G_UNLIKELY (max_len - (p - str) < 4)) 169740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 169840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 169940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen min = (1 << 16); 170040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val = *(guchar *)p & 0x07; 170140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 170240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 170340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 170440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 170540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 170640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 170740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen TWO_REMAINING: 170840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 170940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 171040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 171140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 171240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 171340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (val < min)) 171440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 171540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (!UNICODE_VALID(val))) 171640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 171740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 171840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 171940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen continue; 172040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 172140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen error: 172240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen return last; 172340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 172440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 172540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 172640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen return p; 172740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen} 172840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 1729ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor/** 173049c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * g_utf8_validate: 173149c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * @str: a pointer to character data 173240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * @max_len: max bytes to validate, or -1 to go until NUL 173349c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * @end: return location for end of valid data 173449c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * 173549c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * Validates UTF-8 encoded text. @str is the text to validate; 173649c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * if @str is nul-terminated, then @max_len can be -1, otherwise 173749c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * @max_len should be the number of bytes to validate. 1738ebec3d7ce7c2ca56ab1167403669a74bfe12b110Havoc Pennington * If @end is non-%NULL, then the end of the valid range 173940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * will be stored there (i.e. the start of the first invalid 174040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * character if some bytes were invalid, or the end of the text 174140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * being validated otherwise). 174240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * 174340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * Note that g_utf8_validate() returns %FALSE if @max_len is 174440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * positive and NUL is met before @max_len bytes have been read. 174549c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * 174650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Returns %TRUE if all of @str was valid. Many GLib and GTK+ 174750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * routines <emphasis>require</emphasis> valid UTF-8 as input; 174849c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * so data read from a file or the network should be checked 174949c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * with g_utf8_validate() before doing anything else with it. 175049c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * 175150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Return value: %TRUE if the text was valid UTF-8 175249c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington **/ 175349c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Penningtongboolean 175440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Claseng_utf8_validate (const char *str, 175540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gssize max_len, 175640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar **end) 175749c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington 175840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen{ 175949c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington const gchar *p; 1760956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 176140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (max_len < 0) 176240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p = fast_validate (str); 176340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 176440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p = fast_validate_len (str, max_len); 176549c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington 176649c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington if (end) 176749c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington *end = p; 1768b0baf3db03355028f7c7eabdef9bcfa7f4e08eafHavoc Pennington 176940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((max_len >= 0 && p != str + max_len) || 177040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen (max_len < 0 && *p != '\0')) 1771b0baf3db03355028f7c7eabdef9bcfa7f4e08eafHavoc Pennington return FALSE; 1772b0baf3db03355028f7c7eabdef9bcfa7f4e08eafHavoc Pennington else 1773b0baf3db03355028f7c7eabdef9bcfa7f4e08eafHavoc Pennington return TRUE; 177449c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington} 177549c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington 1776fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington/** 1777fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * g_unichar_validate: 1778fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * @ch: a Unicode character 1779fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * 1780fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * Checks whether @ch is a valid Unicode character. Some possible 1781fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * integer values of @ch will not be valid. 0 is considered a valid 1782fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * character, though it's normally a string terminator. 1783fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * 1784fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * Return value: %TRUE if @ch is a valid Unicode character 1785fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington **/ 1786fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Penningtongboolean 1787fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Penningtong_unichar_validate (gunichar ch) 1788fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington{ 1789fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington return UNICODE_VALID (ch); 1790fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington} 17911bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen 17921bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen/** 17931bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * g_utf8_strreverse: 17941bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * @str: a UTF-8 encoded string 17958b4e4a13d81f2b11a88f5901031839b80233f213Behdad Esfahbod * @len: the maximum length of @str to use, in bytes. If @len < 0, 17968b4e4a13d81f2b11a88f5901031839b80233f213Behdad Esfahbod * then the string is nul-terminated. 17971bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * 17981bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text. 17991bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * (Use g_utf8_validate() on all text before trying to use UTF-8 18001bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * utility functions with it.) 18011bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * 1802137fdf9089bb448fb1a7ac86448268ead81573deMatthias Clasen * This function is intended for programmatic uses of reversed strings. 1803137fdf9089bb448fb1a7ac86448268ead81573deMatthias Clasen * It pays no attention to decomposed characters, combining marks, byte 1804137fdf9089bb448fb1a7ac86448268ead81573deMatthias Clasen * order marks, directional indicators (LRM, LRO, etc) and similar 1805137fdf9089bb448fb1a7ac86448268ead81573deMatthias Clasen * characters which might need special handling when reversing a string 1806137fdf9089bb448fb1a7ac86448268ead81573deMatthias Clasen * for display purposes. 1807137fdf9089bb448fb1a7ac86448268ead81573deMatthias Clasen * 18081bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * Note that unlike g_strreverse(), this function returns 18091bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * newly-allocated memory, which should be freed with g_free() when 18101bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * no longer needed. 18111bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * 18121bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * Returns: a newly-allocated string which is the reverse of @str. 1813a69dc4b65d07db32e200f1100bdeab898720c3c3Matthias Clasen * 1814a69dc4b65d07db32e200f1100bdeab898720c3c3Matthias Clasen * Since: 2.2 18151bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen */ 18161bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasengchar * 181763828ea746a2227f793937a242d7fbb9c09ec4daconst gchar *str,g_utf8_strreverse (const gchar *str, 18181bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen gssize len) 18191bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen{ 18201bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen gchar *r, *result; 18211bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen const gchar *p; 18221bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen 18231bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen if (len < 0) 18241bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen len = strlen (str); 18251bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen 18261bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen result = g_new (gchar, len + 1); 182763828ea746a2227f793937a242d7fbb9c09ec4dar > result) r = result + len; 18281bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen p = str; 182963828ea746a2227f793937a242d7fbb9c09ec4daguchar*) p]; while (r > result) 18301bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen { 18311bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen gchar *m, skip = g_utf8_skip[*(guchar*) p]; 18321bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen r -= skip; 18331bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen for (m = r; skip; skip--) 18341bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen *m++ = *p++; 18351bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen } 18361bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen result[len] = 0; 18371bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen 1838608a31b98e1420f487190871ee7312db2643d93dMatthias Clasen return result; 18398f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen} 18408f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18418f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18428f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasengchar * 18438f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen_g_utf8_make_valid (const gchar *name) 18448f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen{ 18458f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen GString *string; 1846e1b7a25342364562fd43d9ef3c0a5e7586592dc9Paolo Borelli const gchar *remainder, *invalid; 1847e1b7a25342364562fd43d9ef3c0a5e7586592dc9Paolo Borelli gint remaining_bytes, valid_bytes; 1848e1b7a25342364562fd43d9ef3c0a5e7586592dc9Paolo Borelli 18498f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen g_return_val_if_fail (name != NULL, NULL); 18508f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18518f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen string = NULL; 1852e1b7a25342364562fd43d9ef3c0a5e7586592dc9Paolo Borelli remainder = name; 18538f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen remaining_bytes = strlen (name); 18548f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18558f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen while (remaining_bytes != 0) 18568f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen { 18578f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen if (g_utf8_validate (remainder, remaining_bytes, &invalid)) 18588f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen break; 18598f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen valid_bytes = invalid - remainder; 18608f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18618f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen if (string == NULL) 18628f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen string = g_string_sized_new (remaining_bytes); 18638f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18648f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen g_string_append_len (string, remainder, valid_bytes); 18658f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen /* append U+FFFD REPLACEMENT CHARACTER */ 18668f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen g_string_append (string, "\357\277\275"); 18678f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18688f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen remaining_bytes -= valid_bytes + 1; 18698f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen remainder = invalid + 1; 18708f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen } 18718f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18728f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen if (string == NULL) 18738f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen return g_strdup (name); 18748f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18758f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen g_string_append (string, remainder); 18768f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18778f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen g_assert (g_utf8_validate (string->str, -1, NULL)); 18788f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen 18798f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen return g_string_free (string, FALSE); 18808f05c1da81e240b9d838b807f2b87bad88d8019cMatthias Clasen} 1881608a31b98e1420f487190871ee7312db2643d93dMatthias Clasen 1882608a31b98e1420f487190871ee7312db2643d93dMatthias Clasen 1883#define __G_UTF8_C__ 1884#include "galiasdef.c" 1885