gutf8.c revision 608a31b98e1420f487190871ee7312db2643d93d
10891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/* gutf8.c - Operations on UTF-8 strings. 20891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 30891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Copyright (C) 1999 Tom Tromey 40891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Copyright (C) 2000 Red Hat, Inc. 50891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 60891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * This library is free software; you can redistribute it and/or 7c9bd7542e1a28ba9de60048361c0a97d251833e7Tim Janik * modify it under the terms of the GNU Lesser General Public 80891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * License as published by the Free Software Foundation; either 90891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * version 2 of the License, or (at your option) any later version. 100891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 110891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * This library is distributed in the hope that it will be useful, 120891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * but WITHOUT ANY WARRANTY; without even the implied warranty of 130891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14c9bd7542e1a28ba9de60048361c0a97d251833e7Tim Janik * Lesser General Public License for more details. 150891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 16c9bd7542e1a28ba9de60048361c0a97d251833e7Tim Janik * You should have received a copy of the GNU Lesser General Public 170891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * License along with this library; if not, write to the 180891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 190891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Boston, MA 02111-1307, USA. 
200891c64816faaadc8e26f9eebb3205af11323473Owen Taylor */ 210891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 220891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#include <config.h> 230891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 240891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#include <stdlib.h> 25960868332881b9ca1dc94baafa8aa42c07df3101Owen Taylor#ifdef HAVE_CODESET 260891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#include <langinfo.h> 270891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#endif 280891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#include <string.h> 290891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 300891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#include "glib.h" 31608a31b98e1420f487190871ee7312db2643d93dMatthias Clasen#include "galias.h" 320891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 33754d8ddad85ef7054bf262bbc57ee67a0278f493Tor Lillqvist#ifdef G_PLATFORM_WIN32 348a0df0a71c88c04d4fccbc6780b7105bc527261bTor Lillqvist#include <stdio.h> 358a0df0a71c88c04d4fccbc6780b7105bc527261bTor Lillqvist#define STRICT 364f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist#include <windows.h> 37754d8ddad85ef7054bf262bbc57ee67a0278f493Tor Lillqvist#undef STRICT 384f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist#endif 394f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist 40b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor#include "libcharset/libcharset.h" 41b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 42b8796462fbed70f55219da6c3c8f6611de8f2a8cOwen Taylor#include "glibintl.h" 43956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 440891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#define UTF8_COMPUTE(Char, Mask, Len) \ 450891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (Char < 128) \ 460891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 470891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 1; \ 480891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x7f; \ 
490891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 500891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xe0) == 0xc0) \ 510891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 2; \ 530891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x1f; \ 540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 550891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xf0) == 0xe0) \ 560891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 570891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 3; \ 580891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x0f; \ 590891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 600891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xf8) == 0xf0) \ 610891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 620891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 4; \ 630891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x07; \ 640891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 650891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xfc) == 0xf8) \ 660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 5; \ 680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x03; \ 690891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 700891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if ((Char & 0xfe) == 0xfc) \ 710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 720891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = 6; \ 730891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Mask = 0x01; \ 740891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 750891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else \ 760891c64816faaadc8e26f9eebb3205af11323473Owen Taylor Len = -1; 770891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 78956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor#define 
UTF8_LENGTH(Char) \ 79956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x80 ? 1 : \ 80956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x800 ? 2 : \ 81956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x10000 ? 3 : \ 82956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x200000 ? 4 : \ 83956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x4000000 ? 5 : 6))))) 84956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 85956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 860891c64816faaadc8e26f9eebb3205af11323473Owen Taylor#define UTF8_GET(Result, Chars, Count, Mask, Len) \ 870891c64816faaadc8e26f9eebb3205af11323473Owen Taylor (Result) = (Chars)[0] & (Mask); \ 880891c64816faaadc8e26f9eebb3205af11323473Owen Taylor for ((Count) = 1; (Count) < (Len); ++(Count)) \ 890891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 900891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (((Chars)[(Count)] & 0xc0) != 0x80) \ 910891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { \ 920891c64816faaadc8e26f9eebb3205af11323473Owen Taylor (Result) = -1; \ 930891c64816faaadc8e26f9eebb3205af11323473Owen Taylor break; \ 940891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } \ 950891c64816faaadc8e26f9eebb3205af11323473Owen Taylor (Result) <<= 6; \ 960891c64816faaadc8e26f9eebb3205af11323473Owen Taylor (Result) |= ((Chars)[(Count)] & 0x3f); \ 970891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 98956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 99956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor#define UNICODE_VALID(Char) \ 100956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ((Char) < 0x110000 && \ 101cb0e4de11c8132878acb9df9d8923eab73552363Matthias Clasen (((Char) & 0xFFFFF800) != 0xD800) && \ 102cb0e4de11c8132878acb9df9d8923eab73552363Matthias Clasen ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \ 103bc76e98174d16e0575b439508c4378a62f2ba785Noah Levitt ((Char) & 0xFFFE) != 0xFFFE) 104956f00ed96228526cbeda1432df1f729e6f13322Owen 
Taylor 105956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 10649fb6c6cc28dad0049b8e5a94c2cd2762f75baabOwen Taylorstatic const gchar utf8_skip_data[256] = { 1070891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1080891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1090891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1100891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1110891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1120891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1130891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 114f344cb2cc2ba645ebc27c007c44e6cf871e6beccDaniel Elstner 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 1150891c64816faaadc8e26f9eebb3205af11323473Owen Taylor}; 1160891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 11749fb6c6cc28dad0049b8e5a94c2cd2762f75baabOwen Taylorconst gchar * const g_utf8_skip = utf8_skip_data; 11849fb6c6cc28dad0049b8e5a94c2cd2762f75baabOwen Taylor 1190891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 1200891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_find_prev_char: 12150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @str: pointer to the beginning of a UTF-8 encoded string 1220891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @p: pointer to some position within @str 1230891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1240891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Given a position @p with a UTF-8 encoded string @str, find the start 1250891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * of the 
previous UTF-8 character starting before @p. Returns %NULL if no 1260891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * UTF-8 characters are present in @p before @str. 1270891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 12850d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p does not have to be at the beginning of a UTF-8 character. No check 1290891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * is made to see if the character found is actually valid other than 1300891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * it starts with an appropriate byte. 1310891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1320891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: a pointer to the found character or %NULL. 1330891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 1340891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 1350891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_find_prev_char (const char *str, 1360891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const char *p) 1370891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 138149a0fb2bdd8eeae8790f9fd758cc59ae59bcbb5Christopher James Lahey for (--p; p >= str; --p) 1390891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 1400891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if ((*p & 0xc0) != 0x80) 1410891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (gchar *)p; 1420891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 1430891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return NULL; 1440891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 1450891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1460891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 1470891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_find_next_char: 1480891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @p: a pointer to a position within a UTF-8 encoded string 1490891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @end: a pointer to the end of the string, or %NULL to indicate 
15050d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * that the string is nul-terminated, in which case 1510891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * the returned value will be 1520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 15350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Finds the start of the next UTF-8 character in the string after @p. 1540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 15550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p does not have to be at the beginning of a UTF-8 character. No check 1560891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * is made to see if the character found is actually valid other than 1570891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * it starts with an appropriate byte. 1580891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1590891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: a pointer to the found character or %NULL 1600891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 1610891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 1620891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_find_next_char (const gchar *p, 1630891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *end) 1640891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 1650891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (*p) 1660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 1670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (end) 1680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor for (++p; p < end && (*p & 0xc0) == 0x80; ++p) 1690891c64816faaadc8e26f9eebb3205af11323473Owen Taylor ; 1700891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else 1710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor for (++p; (*p & 0xc0) == 0x80; ++p) 1720891c64816faaadc8e26f9eebb3205af11323473Owen Taylor ; 1730891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 1740891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (p == end) ? 
NULL : (gchar *)p; 1750891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 1760891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 1770891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 1780891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_prev_char: 1790891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @p: a pointer to a position within a UTF-8 encoded string 1800891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 18150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Finds the previous UTF-8 character in the string before @p. 1820891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1830891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @p does not have to be at the beginning of a UTF-8 character. No check 1840891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * is made to see if the character found is actually valid other than 1850891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * it starts with an appropriate byte. If @p might be the first 18650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * character of the string, you must use g_utf8_find_prev_char() instead. 1870891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 1880891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: a pointer to the found character. 
1890891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 1900891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 1910891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_prev_char (const gchar *p) 1920891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 1930891c64816faaadc8e26f9eebb3205af11323473Owen Taylor while (TRUE) 1940891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 1950891c64816faaadc8e26f9eebb3205af11323473Owen Taylor p--; 1960891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if ((*p & 0xc0) != 0x80) 1970891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (gchar *)p; 1980891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 1990891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 2000891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2010891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 2020891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_strlen: 20350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: pointer to the start of a UTF-8 encoded string. 2040891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @max: the maximum number of bytes to examine. If @max 2050891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * is less than 0, then the string is assumed to be 206048002c8f74d8779f24085edcb790ee71cc93329Matthias Clasen * nul-terminated. If @max is 0, @p will not be examined and 207048002c8f74d8779f24085edcb790ee71cc93329Matthias Clasen * may be %NULL. 2080891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 20950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Returns the length of the string in characters. 
21050d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * 2110891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: the length of the string in characters 2124eab875811c415d894626b51818a447adfa1af71Havoc Pennington **/ 213f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylorglong 21494b8df8ca039527044a66ea2cd3ab6ebf07b54b8Havoc Penningtong_utf8_strlen (const gchar *p, 215f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor gssize max) 2160891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 217f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len = 0; 2180891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *start = p; 219048002c8f74d8779f24085edcb790ee71cc93329Matthias Clasen g_return_val_if_fail (p != NULL || max == 0, 0); 220767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 221767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington if (max < 0) 222767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington { 223767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington while (*p) 224767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington { 225767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington p = g_utf8_next_char (p); 226767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington ++len; 227767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington } 228767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington } 229767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington else 2300891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 231767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington if (max == 0 || !*p) 232767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington return 0; 233767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 234767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington p = g_utf8_next_char (p); 235767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 236767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington while (p - start < max && *p) 237767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 
{ 238767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington ++len; 239767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington p = g_utf8_next_char (p); 240767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington } 241767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 242767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington /* only do the last len increment if we got a complete 243767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington * char (don't count partial chars) 244767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington */ 245767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington if (p - start == max) 246767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington ++len; 2470891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 248767800fcb36be28c1191ff24ca3ba99c4df4a07fHavoc Pennington 2490891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return len; 2500891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 2510891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 2530891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_get_char: 25450d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: a pointer to Unicode character encoded as UTF-8 2550891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 25650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. 257ebec3d7ce7c2ca56ab1167403669a74bfe12b110Havoc Pennington * If @p does not point to a valid UTF-8 encoded character, results are 258f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * undefined. If you are not sure that the bytes are complete 25950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * valid Unicode characters, you should use g_utf8_get_char_validated() 260f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * instead. 
2610891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 262ebec3d7ce7c2ca56ab1167403669a74bfe12b110Havoc Pennington * Return value: the resulting character 2630891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 2640891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgunichar 2650891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_get_char (const gchar *p) 2660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 2670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor int i, mask = 0, len; 2680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gunichar result; 2690891c64816faaadc8e26f9eebb3205af11323473Owen Taylor unsigned char c = (unsigned char) *p; 2700891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor UTF8_COMPUTE (c, mask, len); 2720891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (len == -1) 2730891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (gunichar)-1; 2740891c64816faaadc8e26f9eebb3205af11323473Owen Taylor UTF8_GET (result, p, i, mask, len); 2750891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2760891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return result; 2770891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 2780891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2790891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 2800891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_offset_to_pointer: 2810891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @str: a UTF-8 encoded string 28250d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @offset: a character offset within @str 2830891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 2840891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Converts from an integer character offset to a pointer to a position 2850891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * within the string. 
2860891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 2870891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: the resulting pointer 2880891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 2890891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 2900891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_offset_to_pointer (const gchar *str, 291f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong offset) 2920891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 2930891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *s = str; 2940891c64816faaadc8e26f9eebb3205af11323473Owen Taylor while (offset--) 2950891c64816faaadc8e26f9eebb3205af11323473Owen Taylor s = g_utf8_next_char (s); 2960891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 2970891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return (gchar *)s; 2980891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 2990891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 3000891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 3010891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_pointer_to_offset: 3020891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @str: a UTF-8 encoded string 3030891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * @pos: a pointer to a position within @str 3040891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 3050891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Converts from a pointer to position within a string to a integer 30650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * character offset. 
3070891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 3080891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: the resulting character offset 3090891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 310f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylorglong 3110891c64816faaadc8e26f9eebb3205af11323473Owen Taylorg_utf8_pointer_to_offset (const gchar *str, 3120891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *pos) 3130891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 3140891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *s = str; 315f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong offset = 0; 3160891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 3170891c64816faaadc8e26f9eebb3205af11323473Owen Taylor while (s < pos) 3180891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 3190891c64816faaadc8e26f9eebb3205af11323473Owen Taylor s = g_utf8_next_char (s); 3200891c64816faaadc8e26f9eebb3205af11323473Owen Taylor offset++; 3210891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 3220891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 3230891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return offset; 3240891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 3250891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 3260891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 3274eab875811c415d894626b51818a447adfa1af71Havoc Pennington/** 3284eab875811c415d894626b51818a447adfa1af71Havoc Pennington * g_utf8_strncpy: 3294eab875811c415d894626b51818a447adfa1af71Havoc Pennington * @dest: buffer to fill with characters from @src 33050d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @src: UTF-8 encoded string 3314eab875811c415d894626b51818a447adfa1af71Havoc Pennington * @n: character count 3324eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 333a412fb16541620ed72da86daac0774afe4703d9dMatthias Clasen * Like the standard C strncpy() function, but 33450d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * copies a given 
number of characters instead of a given number of 33550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * bytes. The @src string must be valid UTF-8 encoded text. 33650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * (Use g_utf8_validate() on all text before trying to use UTF-8 33750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * utility functions with it.) 3384eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 3394eab875811c415d894626b51818a447adfa1af71Havoc Pennington * Return value: @dest 3404eab875811c415d894626b51818a447adfa1af71Havoc Pennington **/ 3410891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 3426d7ee813037c8b1cda721f6b37297a5c89ff18c0Owen Taylorg_utf8_strncpy (gchar *dest, 3436d7ee813037c8b1cda721f6b37297a5c89ff18c0Owen Taylor const gchar *src, 3446d7ee813037c8b1cda721f6b37297a5c89ff18c0Owen Taylor gsize n) 3450891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 3460891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *s = src; 3470891c64816faaadc8e26f9eebb3205af11323473Owen Taylor while (n && *s) 3480891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 3490891c64816faaadc8e26f9eebb3205af11323473Owen Taylor s = g_utf8_next_char(s); 3500891c64816faaadc8e26f9eebb3205af11323473Owen Taylor n--; 3510891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 3520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor strncpy(dest, src, s - src); 3530891c64816faaadc8e26f9eebb3205af11323473Owen Taylor dest[s - src] = 0; 3540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return dest; 3550891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 3560891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 357b5fa5b9867eec91047a16d45f79888395cf89931Owen TaylorG_LOCK_DEFINE_STATIC (aliases); 3580891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 359b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylorstatic GHashTable * 360b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylorget_alias_hash (void) 
361b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor{ 362b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor static GHashTable *alias_hash = NULL; 363b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char *aliases; 3640891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 365b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor G_LOCK (aliases); 3660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 367b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (!alias_hash) 3680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 369b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_hash = g_hash_table_new (g_str_hash, g_str_equal); 370b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 371b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor aliases = _g_locale_get_charset_aliases (); 372b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor while (*aliases != '\0') 373b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor { 374b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char *canonical; 375b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char *alias; 376b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char **alias_array; 377b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor int count = 0; 378b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 379b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias = aliases; 380b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor aliases += strlen (aliases) + 1; 381b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor canonical = aliases; 382b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor aliases += strlen (aliases) + 1; 383b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 384b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_array = g_hash_table_lookup (alias_hash, canonical); 385b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (alias_array) 386b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor { 387b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor while 
(alias_array[count]) 388b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor count++; 389b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor } 390b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 391b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_array = g_renew (const char *, alias_array, count + 2); 392b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_array[count] = alias; 393b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor alias_array[count + 1] = NULL; 394b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 395b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor g_hash_table_insert (alias_hash, (char *)canonical, alias_array); 396b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor } 3970891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 398b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 399b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor G_UNLOCK (aliases); 400b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 401b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return alias_hash; 402b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor} 403b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 404b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor/* As an abuse of the alias table, the following routines gets 405b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor * the charsets that are aliases for the canonical name. 
406b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor */ 407b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylorconst char ** 408b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor_g_charset_get_aliases (const char *canonical_name) 409b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor{ 410b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor GHashTable *alias_hash = get_alias_hash (); 411b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 412b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return g_hash_table_lookup (alias_hash, canonical_name); 413b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor} 414b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 415b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylorstatic gboolean 416ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylorg_utf8_get_charset_internal (const char *raw_data, 417ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor const char **a) 418b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor{ 419b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor const char *charset = getenv("CHARSET"); 420b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 421b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (charset && *charset) 4220891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 423b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor *a = charset; 424b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 425b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (charset && strstr (charset, "UTF-8")) 4260891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return TRUE; 427b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor else 428b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return FALSE; 4290891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 4300891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 431b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor /* The libcharset code tries to be thread-safe without 432b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor * a lock, but has a memory leak and a missing 
memory 433b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor * barrier, so we lock for it 434b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor */ 435b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor G_LOCK (aliases); 436ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor charset = _g_locale_charset_unalias (raw_data); 437b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor G_UNLOCK (aliases); 438b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 439b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (charset && *charset) 4404f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist { 441b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor *a = charset; 4424f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist 443b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor if (charset && strstr (charset, "UTF-8")) 444b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return TRUE; 445b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor else 446b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor return FALSE; 4474f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist } 4484f9e04c1d78ffe95f60101bea8381c6be5575069Tor Lillqvist 4490891c64816faaadc8e26f9eebb3205af11323473Owen Taylor /* Assume this for compatibility at present. 
*/ 450b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor *a = "US-ASCII"; 451b5fa5b9867eec91047a16d45f79888395cf89931Owen Taylor 4520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return FALSE; 4530891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 4540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 455ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylortypedef struct _GCharsetCache GCharsetCache; 456ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 457ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylorstruct _GCharsetCache { 458ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor gboolean is_utf8; 459ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor gchar *raw; 460ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor gchar *charset; 461ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor}; 462ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 463ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylorstatic void 464ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylorcharset_cache_free (gpointer data) 465ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor{ 466ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor GCharsetCache *cache = data; 467ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache->raw); 468ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache->charset); 469ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache); 470ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor} 4710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 4724eab875811c415d894626b51818a447adfa1af71Havoc Pennington/** 4734eab875811c415d894626b51818a447adfa1af71Havoc Pennington * g_get_charset: 4744eab875811c415d894626b51818a447adfa1af71Havoc Pennington * @charset: return location for character set name 4754eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 4764eab875811c415d894626b51818a447adfa1af71Havoc Pennington * Obtains the character set for the current locale; you might use 4774eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 
this character set as an argument to g_convert(), to convert from 4784eab875811c415d894626b51818a447adfa1af71Havoc Pennington * the current locale's encoding to some other encoding. (Frequently 4794eab875811c415d894626b51818a447adfa1af71Havoc Pennington * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, 4804eab875811c415d894626b51818a447adfa1af71Havoc Pennington * though.) 4814eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 4824eab875811c415d894626b51818a447adfa1af71Havoc Pennington * The return value is %TRUE if the locale's encoding is UTF-8, in that 4834eab875811c415d894626b51818a447adfa1af71Havoc Pennington * case you can perhaps avoid calling g_convert(). 4844eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 4854eab875811c415d894626b51818a447adfa1af71Havoc Pennington * The string returned in @charset is not allocated, and should not be 4864eab875811c415d894626b51818a447adfa1af71Havoc Pennington * freed. 4874eab875811c415d894626b51818a447adfa1af71Havoc Pennington * 4884eab875811c415d894626b51818a447adfa1af71Havoc Pennington * Return value: %TRUE if the returned charset is UTF-8 4894eab875811c415d894626b51818a447adfa1af71Havoc Pennington **/ 4900891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgboolean 491f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylorg_get_charset (G_CONST_RETURN char **charset) 4920891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 493ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT; 494ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor GCharsetCache *cache = g_static_private_get (&cache_private); 495ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor const gchar *raw; 496ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 497ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor if (!cache) 4980891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 499ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor cache = g_new0 (GCharsetCache, 1); 
500ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_static_private_set (&cache_private, cache, charset_cache_free); 5010891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 502ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 503ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor raw = _g_locale_charset_raw (); 504ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 505ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor if (!(cache->raw && strcmp (cache->raw, raw) == 0)) 506ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor { 507ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor const gchar *new_charset; 508ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 509ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache->raw); 510ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor g_free (cache->charset); 511ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor cache->raw = g_strdup (raw); 512ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset); 513ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor cache->charset = g_strdup (new_charset); 514ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor } 515ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 516ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor if (charset) 517ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor *charset = cache->charset; 518ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor 519ceb35b237bb7538c0defd3acc13f95af3081495eOwen Taylor return cache->is_utf8; 5200891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 5210891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 5220891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/* unicode_strchr */ 5230891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 5240891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 5250891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_unichar_to_utf8: 526da765af2bcbcf1718a49cb9634766e4884493f5dOwen Taylor * @c: a ISO10646 character code 
527da765af2bcbcf1718a49cb9634766e4884493f5dOwen Taylor * @outbuf: output buffer, must have at least 6 bytes of space. 52837e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor * If %NULL, the length will be computed and returned 52950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * and nothing will be written to @outbuf. 5300891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 53150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Converts a single character to UTF-8. 5320891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 5330891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: number of bytes written 5340891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 5350891c64816faaadc8e26f9eebb3205af11323473Owen Taylorint 536f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylorg_unichar_to_utf8 (gunichar c, 537f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor gchar *outbuf) 5380891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 539f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor guint len = 0; 5400891c64816faaadc8e26f9eebb3205af11323473Owen Taylor int first; 5410891c64816faaadc8e26f9eebb3205af11323473Owen Taylor int i; 5420891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 5430891c64816faaadc8e26f9eebb3205af11323473Owen Taylor if (c < 0x80) 5440891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 5450891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0; 5460891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 1; 5470891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 5480891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if (c < 0x800) 5490891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 5500891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xc0; 5510891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 2; 5520891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 5530891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if (c < 0x10000) 5540891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 
5550891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xe0; 5560891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 3; 5570891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 5580891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if (c < 0x200000) 5590891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 5600891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xf0; 5610891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 4; 5620891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 5630891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else if (c < 0x4000000) 5640891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 5650891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xf8; 5660891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 5; 5670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 5680891c64816faaadc8e26f9eebb3205af11323473Owen Taylor else 5690891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 5700891c64816faaadc8e26f9eebb3205af11323473Owen Taylor first = 0xfc; 5710891c64816faaadc8e26f9eebb3205af11323473Owen Taylor len = 6; 5720891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 5730891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 57437e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor if (outbuf) 5750891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 57637e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor for (i = len - 1; i > 0; --i) 57737e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor { 57837e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor outbuf[i] = (c & 0x3f) | 0x80; 57937e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor c >>= 6; 58037e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor } 58137e7118821a81f524931d8a4fa8d7815dd82eb5eOwen Taylor outbuf[0] = c | first; 5820891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 5830891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 5840891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return len; 5850891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 
5860891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 5870891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 5880891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_strchr: 58950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: a nul-terminated UTF-8 encoded string 59050d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @p 59150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @c: a ISO10646 character 5920891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 59350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Finds the leftmost occurrence of the given ISO10646 character 59450d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * in a UTF-8 encoded string, while limiting the search to @len bytes. 59550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * If @len is -1, allow unbounded search. 5960891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 59750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Return value: %NULL if the string does not contain the character, 59850d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * otherwise, a pointer to the start of the leftmost occurrence of 59950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * the character in the string. 
6000891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 6010891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 602106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larssong_utf8_strchr (const char *p, 60316fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor gssize len, 604106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larsson gunichar c) 6050891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 6060891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gchar ch[10]; 6070891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 60816fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor gint charlen = g_unichar_to_utf8 (c, ch); 60916fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor ch[charlen] = '\0'; 6100891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 61116fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor return g_strstr_len (p, len, ch); 6120891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 6130891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 614106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larsson 6150891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 6160891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * g_utf8_strrchr: 61750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: a nul-terminated UTF-8 encoded string 61850d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @p 61950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @c: a ISO10646 character 6200891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 62150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Find the rightmost occurrence of the given ISO10646 character 62250d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * in a UTF-8 encoded string, while limiting the search to @len bytes. 62350d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * If @len is -1, allow unbounded search. 
6240891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 62550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Return value: %NULL if the string does not contain the character, 62650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * otherwise, a pointer to the start of the rightmost occurrence of the 62750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * character in the string. 6280891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 6290891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgchar * 630106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larssong_utf8_strrchr (const char *p, 63116fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor gssize len, 632106fb627f1dc5c51a8fb759702344ae6f08d60c7Alex Larsson gunichar c) 6330891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 6340891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gchar ch[10]; 6350891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 63616fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor gint charlen = g_unichar_to_utf8 (c, ch); 63716fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor ch[charlen] = '\0'; 6380891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 63916fc3b22c0ae7ca268f34d554da4f4850748d335Owen Taylor return g_strrstr_len (p, len, ch); 6400891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 6410891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 6420891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 643956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/* Like g_utf8_get_char, but take a maximum length 644956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * and return (gunichar)-2 on incomplete trailing character 645956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor */ 646956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorstatic inline gunichar 647f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylorg_utf8_get_char_extended (const gchar *p, 648f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor gssize max_len) 649956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 
650f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor guint i, len; 651956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = (guchar) *p; 652956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 653956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0x80) 654956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 655956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return wc; 656956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 657956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xc0) 658956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 659956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 660956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 661956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xe0) 662956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 663956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 2; 664956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x1f; 665956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 666956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xf0) 667956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 668956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 3; 669956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x0f; 670956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 671956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xf8) 672956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 673956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 4; 674956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x07; 675956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 676956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xfc) 677956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 678956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 5; 679956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x03; 680956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 
} 681956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xfe) 682956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 683956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor len = 6; 684956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x01; 685956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 686956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 687956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 688956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 689956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 690956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 691956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (max_len >= 0 && len > max_len) 692956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 693956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 1; i < max_len; i++) 694956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 695956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if ((((guchar *)p)[i] & 0xc0) != 0x80) 696956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 697956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 698956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-2; 699956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 700956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 701956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 1; i < len; ++i) 702956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 703956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar ch = ((guchar *)p)[i]; 704956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 705956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if ((ch & 0xc0) != 0x80) 706956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 707956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (ch) 708956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 709956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 710956f00ed96228526cbeda1432df1f729e6f13322Owen 
Taylor return (gunichar)-2; 711956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 712956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 713956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc <<= 6; 714956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc |= (ch & 0x3f); 715956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 716956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 717956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (UTF8_LENGTH(wc) != len) 718956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar)-1; 719956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 720956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return wc; 721956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 722956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 7230891c64816faaadc8e26f9eebb3205af11323473Owen Taylor/** 724f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * g_utf8_get_char_validated: 72550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @p: a pointer to Unicode character encoded as UTF-8 726f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * @max_len: the maximum number of bytes to read, or -1, for no maximum. 727f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * 72850d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Convert a sequence of bytes encoded as UTF-8 to a Unicode character. 729f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * This function checks for incomplete characters, for invalid characters 730f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * such as characters that are out of the range of Unicode, and for 731f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * overlong encodings of valid characters. 732f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * 733f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor * Return value: the resulting character. 
If @p points to a partial 734b205c9267b85243a1081b0854d172bf25284fbabMatthias Clasen * sequence at the end of a string that could begin a valid 735b205c9267b85243a1081b0854d172bf25284fbabMatthias Clasen * character, returns (gunichar)-2; otherwise, if @p does not point 736b205c9267b85243a1081b0854d172bf25284fbabMatthias Clasen * to a valid UTF-8 encoded Unicode character, returns (gunichar)-1. 737f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor **/ 738f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylorgunichar 739f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylorg_utf8_get_char_validated (const gchar *p, 740f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor gssize max_len) 741f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor{ 742f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor gunichar result = g_utf8_get_char_extended (p, max_len); 743f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor 744f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor if (result & 0x80000000) 745f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor return result; 746f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor else if (!UNICODE_VALID (result)) 747f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor return (gunichar)-1; 748f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor else 749f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor return result; 750f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor} 751f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor 752f37c13dbde8f9d21a95c86c019b27d7dcaa0f4f9Owen Taylor/** 753956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf8_to_ucs4_fast: 754956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-8 encoded string 75550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @str to use. If @len < 0, then 75650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * the string is nul-terminated. 
757956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store the number of characters in the 758956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * result, or %NULL. 759956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 7600891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Convert a string from UTF-8 to a 32-bit fixed width 761956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * representation as UCS-4, assuming valid UTF-8 input. 762956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This function is roughly twice as fast as g_utf8_to_ucs4() 763956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * but does no error checking on the input. 7640891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * 7650891c64816faaadc8e26f9eebb3205af11323473Owen Taylor * Return value: a pointer to a newly allocated UCS-4 string. 76650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * This value must be freed with g_free(). 7670891c64816faaadc8e26f9eebb3205af11323473Owen Taylor **/ 7680891c64816faaadc8e26f9eebb3205af11323473Owen Taylorgunichar * 769956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf8_to_ucs4_fast (const gchar *str, 770f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 771f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written) 7720891c64816faaadc8e26f9eebb3205af11323473Owen Taylor{ 773956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint j, charlen; 7740891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gunichar *result; 7750891c64816faaadc8e26f9eebb3205af11323473Owen Taylor gint n_chars, i; 7760891c64816faaadc8e26f9eebb3205af11323473Owen Taylor const gchar *p; 777956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 778956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_return_val_if_fail (str != NULL, NULL); 779956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 780956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p = str; 781956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_chars 
= 0; 782956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (len < 0) 783956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 784956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while (*p) 785956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 786956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p = g_utf8_next_char (p); 787956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ++n_chars; 788956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 789956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 790956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 791956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 792d4e0ae748ac3b123e0aff97f965bfffbe4046a3bOwen Taylor while (p < str + len && *p) 793956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 794956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p = g_utf8_next_char (p); 795956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor ++n_chars; 796956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 797956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 7980891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 799956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_new (gunichar, n_chars + 1); 8000891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 8010891c64816faaadc8e26f9eebb3205af11323473Owen Taylor p = str; 8020891c64816faaadc8e26f9eebb3205af11323473Owen Taylor for (i=0; i < n_chars; i++) 8030891c64816faaadc8e26f9eebb3205af11323473Owen Taylor { 804956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = ((unsigned char *)p)[0]; 805956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 806956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0x80) 807956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 808956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = wc; 809956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p++; 810956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 811956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 
812956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 813956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0xe0) 814956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 815956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 2; 816956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x1f; 817956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 818956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xf0) 819956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 820956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 3; 821956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x0f; 822956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 823956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xf8) 824956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 825956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 4; 826956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x07; 827956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 828956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xfc) 829956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 830956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 5; 831956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x03; 832956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 833956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 834956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 835956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor charlen = 6; 836956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc &= 0x01; 837956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 838956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 839956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (j = 1; j < charlen; j++) 840956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 841956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc <<= 6; 842956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc |= ((unsigned char 
*)p)[j] & 0x3f; 843956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 844956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 845956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = wc; 846956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p += charlen; 847956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 8480891c64816faaadc8e26f9eebb3205af11323473Owen Taylor } 849956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = 0; 850956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 851956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 852956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = i; 853956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 854956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return result; 855956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 856956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 857956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 858956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf8_to_ucs4: 859956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-8 encoded string 86050d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @str to use. If @len < 0, then 86150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * the string is nul-terminated. 862956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of bytes read, or %NULL. 863956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 864956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * returned in case @str contains a trailing partial 865956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * character. If an error occurs then the index of the 866956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * invalid input is stored here. 867956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of characters written or %NULL. 
*                 The value stored here does not include the trailing 0
 *                 character.
 * @error: location to store the error occurring, or %NULL to ignore
 *         errors. Any of the errors in #GConvertError other than
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 *
 * Convert a string from UTF-8 to a 32-bit fixed width
 * representation as UCS-4. A trailing 0 will be added to the
 * string after the converted text.
 *
 * Return value: a pointer to a newly allocated UCS-4 string.
 *               This value must be freed with g_free(). If an
 *               error occurs, %NULL will be returned and
 *               @error set.
882956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 883956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgunichar * 884956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf8_to_ucs4 (const gchar *str, 885f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 886f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 887f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 888956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 889956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 890956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar *result = NULL; 891956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n_chars, i; 892956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor const gchar *in; 893956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 894956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 895956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_chars = 0; 896956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || str + len - in > 0) && *in) 897956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 898956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = g_utf8_get_char_extended (in, str + len - in); 899956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc & 0x80000000) 900956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 901956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc == (gunichar)-2) 902956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 903956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 904956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor break; 905956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 906956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 907956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Partial character sequence at end of input")); 908956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 
909956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 910956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 911956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Invalid byte sequence in conversion input")); 912956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 913956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 914956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 915956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 916956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_chars++; 917956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 918956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = g_utf8_next_char (in); 919956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 920956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 921956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_new (gunichar, n_chars + 1); 922956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 923956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 924956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i=0; i < n_chars; i++) 925956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 926956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = g_utf8_get_char (in); 927956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = g_utf8_next_char (in); 928956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 929956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i] = 0; 930956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 931956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 932956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = n_chars; 933956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 934956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 935956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 936956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = in - str; 
9370891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 9380891c64816faaadc8e26f9eebb3205af11323473Owen Taylor return result; 9390891c64816faaadc8e26f9eebb3205af11323473Owen Taylor} 9400891c64816faaadc8e26f9eebb3205af11323473Owen Taylor 94149c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington/** 942ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * g_ucs4_to_utf8: 943ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * @str: a UCS-4 encoded string 94450d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @str to use. If @len < 0, then 94550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * the string is terminated with a 0 character. 946956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of characters read read, or %NULL. 947956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of bytes written or %NULL. 948956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * The value here stored does not include the trailing 0 949956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * byte. 950956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 951956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 952956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 953956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 954ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * Convert a string from a 32-bit fixed width representation as UCS-4. 955956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * to UTF-8. The result will be terminated with a 0 byte. 956ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * 957ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor * Return value: a pointer to a newly allocated UTF-8 string. 
958956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 959956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 960956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 961ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor **/ 962ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylorgchar * 963956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_ucs4_to_utf8 (const gunichar *str, 964f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 965f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 966f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 967956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 968ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor{ 969ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor gint result_length; 970956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *result = NULL; 971956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *p; 972ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor gint i; 973ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 974ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor result_length = 0; 975956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 0; len < 0 || i < len ; i++) 976956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 977956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (!str[i]) 978956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor break; 979ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 980956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (str[i] >= 0x80000000) 981956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 982956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 983956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = i; 984956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 985956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, 
G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 986956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Character out of range for UTF-8")); 987956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 988956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 989956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 990956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result_length += UTF8_LENGTH (str[i]); 991956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 992ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 993ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor result = g_malloc (result_length + 1); 994ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor p = result; 995ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 996956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor i = 0; 997956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while (p < result + result_length) 998956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor p += g_unichar_to_utf8 (str[i++], p); 999ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 1000ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor *p = '\0'; 1001ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 1002956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1003956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = p - result; 1004956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1005956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1006956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1007956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = i; 1008956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1009956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return result; 1010956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 1011956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1012956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000) 
1013956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1014956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 1015956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf16_to_utf8: 1016956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-16 encoded string 101750d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @str to use. If @len < 0, then 1018956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * the string is terminated with a 0 character. 1019956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of words read, or %NULL. 1020956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 1021956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * returned in case @str contains a trailing partial 1022956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * character. If an error occurs then the index of the 1023956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * invalid input is stored here. 1024956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of bytes written, or %NULL. 1025956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * The value stored here does not include the trailing 1026956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 0 byte. 1027956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 1028956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 1029956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1030956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1031956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Convert a string from UTF-16 to UTF-8. The result will be 1032956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * terminated with a 0 byte. 
1033a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * 1034a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * Note that the input is expected to be already in native endianness, 1035a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * an initial byte-order-mark character is not handled specially. 1036a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * g_convert() can be used to convert a byte buffer of UTF-16 data of 1037a4026b4cbff5d59b94a7e4ffabe764f6211839f6Matthias Clasen * ambiguous endianess. 1038956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1039956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Return value: a pointer to a newly allocated UTF-8 string. 1040956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 1041956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 1042956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 1043956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 1044956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgchar * 1045956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf16_to_utf8 (const gunichar2 *str, 1046f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 1047f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 1048f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 1049956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 1050956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 1051956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ 1052956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * are marked. 
1053956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor */ 1054956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor const gunichar2 *in; 1055956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *out; 1056956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *result = NULL; 1057956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n_bytes; 1058956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar high_surrogate; 1059956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1060956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_return_val_if_fail (str != 0, NULL); 1061956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1062956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_bytes = 0; 1063956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1064956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1065956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || in - str < len) && *in) 1066956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1067956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 c = *in; 1068956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc; 1069956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1070956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1071956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1072956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate) 1073956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1074956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = SURROGATE_VALUE (high_surrogate, c); 1075956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1076956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1077956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1078956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1079956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 
1080956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Invalid sequence in conversion input")); 1081956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1082956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1083956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1084956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1085956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1086956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate) 1087956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1088956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1089956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Invalid sequence in conversion input")); 1090956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1091956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1092956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1093956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1094956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1095956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = c; 1096956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto next1; 1097956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1098956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1099956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = c; 1100956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1101956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1102956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1103956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_bytes += UTF8_LENGTH (wc); 1104956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1105956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor next1: 1106956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in++; 1107956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 
1108956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1109956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate && !items_read) 1110956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1111956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 1112956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Partial character sequence at end of input")); 1113956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1114956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1115956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1116956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /* At this point, everything is valid, and we just need to convert 1117956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor */ 1118956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1119956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_malloc (n_bytes + 1); 1120956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1121956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1122956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor out = result; 1123956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1124956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while (out < result + n_bytes) 1125956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1126956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 c = *in; 1127956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc; 1128956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1129956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1130956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1131956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = SURROGATE_VALUE (high_surrogate, c); 1132956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1133956f00ed96228526cbeda1432df1f729e6f13322Owen 
Taylor } 1134956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1135956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1136956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = c; 1137956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto next2; 1138956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1139956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1140956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = c; 1141956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1142956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1143956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor out += g_unichar_to_utf8 (wc, out); 1144956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1145956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor next2: 1146956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in++; 1147956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1148956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1149956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1150956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *out = '\0'; 1151956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1152956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1153956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1154956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = out - result; 1155956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1156956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1157956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1158956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = in - str; 1159956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1160956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return result; 1161956f00ed96228526cbeda1432df1f729e6f13322Owen 
Taylor} 1162956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1163956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 1164956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf16_to_ucs4: 1165956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-16 encoded string 116650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @str to use. If @len < 0, then 1167956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * the string is terminated with a 0 character. 1168956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of words read, or %NULL. 1169956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 1170956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * returned in case @str contains a trailing partial 1171956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * character. If an error occurs then the index of the 1172956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * invalid input is stored here. 1173956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of characters written, or %NULL. 1174956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * The value stored here does not include the trailing 1175956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 0 character. 1176956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 1177956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 1178956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1179956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1180956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Convert a string from UTF-16 to UCS-4. The result will be 1181956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * terminated with a 0 character. 
1182956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1183956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Return value: a pointer to a newly allocated UCS-4 string. 1184956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 1185956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 1186956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 1187956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 1188956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgunichar * 1189956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf16_to_ucs4 (const gunichar2 *str, 1190f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 1191f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 1192f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 1193956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 1194956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 1195956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor const gunichar2 *in; 1196956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *out; 1197956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gchar *result = NULL; 1198956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n_bytes; 1199956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar high_surrogate; 1200956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1201956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_return_val_if_fail (str != 0, NULL); 1202956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1203956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_bytes = 0; 1204956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1205956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1206956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || in - str < len) && *in) 1207956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 
1208956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 c = *in; 1209956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc; 1210956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1211956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1212956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1213956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate) 1214956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1215956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = SURROGATE_VALUE (high_surrogate, c); 1216956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1217956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1218956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1219956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1220956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1221956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Invalid sequence in conversion input")); 1222956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1223956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1224956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1225956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1226956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1227956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate) 1228956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1229956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1230956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Invalid sequence in conversion input")); 1231956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1232956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1233956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1234956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xd800 
&& c < 0xdc00) /* high surrogate */ 1235956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1236956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = c; 1237956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto next1; 1238956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1239956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1240956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = c; 1241956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1242956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1243956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1244956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n_bytes += sizeof (gunichar); 1245956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1246956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor next1: 1247956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in++; 1248956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1249956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1250956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (high_surrogate && !items_read) 1251956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1252956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 1253956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Partial character sequence at end of input")); 1254956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1255956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1256956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1257956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /* At this point, everything is valid, and we just need to convert 1258956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor */ 1259956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1260956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_malloc (n_bytes + 4); 
1261956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1262956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1263956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor out = result; 1264956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1265956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while (out < result + n_bytes) 1266956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1267956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 c = *in; 1268956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc; 1269956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1270956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1271956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1272956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = SURROGATE_VALUE (high_surrogate, c); 1273956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = 0; 1274956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1275956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1276956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1277956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor high_surrogate = c; 1278956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto next2; 1279956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1280956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1281956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor wc = c; 1282956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1283956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1284956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *(gunichar *)out = wc; 1285956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor out += sizeof (gunichar); 1286956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1287956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor next2: 1288956f00ed96228526cbeda1432df1f729e6f13322Owen 
Taylor in++; 1289956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1290956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1291956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1292956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *(gunichar *)out = 0; 1293956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1294956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1295956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor /********** DIFFERENT for UTF8/UCS4 **********/ 1296956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = (out - result) / sizeof (gunichar); 1297956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1298956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1299956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1300956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = in - str; 1301956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1302956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return (gunichar *)result; 1303956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 1304956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1305956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 1306956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_utf8_to_utf16: 1307956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UTF-8 encoded string 130850d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @str to use. If @len < 0, then 130950d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * the string is nul-terminated. 1310956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of bytes read, or %NULL. 
1311956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 1312956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * returned in case @str contains a trailing partial 1313956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * character. If an error occurs then the index of the 1314956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * invalid input is stored here. 1315956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of words written, or %NULL. 1316956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * The value stored here does not include the trailing 1317956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 0 word. 1318956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 1319956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. Any of the errors in #GConvertError other than 1320956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1321956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1322956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Convert a string from UTF-8 to UTF-16. A 0 word will be 1323956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * added to the result after the converted text. 1324956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1325956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Return value: a pointer to a newly allocated UTF-16 string. 1326956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 1327956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 1328956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 
1329956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 1330956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgunichar2 * 1331956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_utf8_to_utf16 (const gchar *str, 1332f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 1333f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 1334f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 1335956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 1336956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 1337956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 *result = NULL; 1338956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n16; 1339956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor const gchar *in; 1340956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint i; 1341956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1342956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_return_val_if_fail (str != NULL, NULL); 1343956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1344956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1345956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 = 0; 1346956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || str + len - in > 0) && *in) 1347956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1348956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = g_utf8_get_char_extended (in, str + len - in); 1349956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc & 0x80000000) 1350956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1351956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc == (gunichar)-2) 1352956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1353956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1354956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor break; 1355956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1356956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 1357956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Partial character sequence at end of input")); 1358956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1359956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1360956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1361956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Invalid byte sequence in conversion input")); 1362956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1363956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1364956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1365956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1366956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0xd800) 1367956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 1; 1368956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xe000) 1369956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1370956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1371956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Invalid sequence in conversion input")); 1372956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1373956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1374956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1375956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0x10000) 1376956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 1; 1377956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0x110000) 1378956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 2; 1379956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1380956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1381956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 
1382956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Character out of range for UTF-16")); 1383956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1384956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1385956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1386956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1387956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = g_utf8_next_char (in); 1388956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1389956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1390956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_new (gunichar2, n16 + 1); 1391956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1392956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = str; 1393956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 0; i < n16;) 1394956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1395956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = g_utf8_get_char (in); 1396956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1397956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0x10000) 1398956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1399956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i++] = wc; 1400956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1401956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1402956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1403956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i++] = (wc - 0x10000) / 0x400 + 0xd800; 1404956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[i++] = (wc - 0x10000) % 0x400 + 0xdc00; 1405956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1406956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1407956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor in = g_utf8_next_char (in); 1408956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1409956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1410956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 
result[i] = 0; 1411956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1412956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1413956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = n16; 1414956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1415956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1416956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1417956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = in - str; 1418956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1419956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor return result; 1420956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor} 1421956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1422956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor/** 1423956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * g_ucs4_to_utf16: 1424956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @str: a UCS-4 encoded string 142550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * @len: the maximum length of @str to use. If @len < 0, then 142650d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * the string is terminated with a 0 character. 1427956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_read: location to store number of bytes read, or %NULL. 1428956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * If an error occurs then the index of the invalid input 1429956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * is stored here. 1430956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @items_written: location to store number of words written, or %NULL. 1431956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * The value stored here does not include the trailing 1432956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 0 word. 1433956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error: location to store the error occuring, or %NULL to ignore 1434956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * errors. 
Any of the errors in #GConvertError other than 1435956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1436956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1437956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Convert a string from UCS-4 to UTF-16. A 0 word will be 1438956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * added to the result after the converted text. 1439956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * 1440956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * Return value: a pointer to a newly allocated UTF-16 string. 1441956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * This value must be freed with g_free(). If an 1442956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * error occurs, %NULL will be returned and 1443956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor * @error set. 1444956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor **/ 1445956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorgunichar2 * 1446956f00ed96228526cbeda1432df1f729e6f13322Owen Taylorg_ucs4_to_utf16 (const gunichar *str, 1447f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong len, 1448f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_read, 1449f5c28ce4ab8e8015a1432060b6cfe547183b2f9eOwen Taylor glong *items_written, 1450956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor GError **error) 1451956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor{ 1452956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar2 *result = NULL; 1453956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint n16; 1454956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gint i, j; 1455956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1456956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 = 0; 1457956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor i = 0; 1458956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor while ((len < 0 || i < len) && str[i]) 1459956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 
1460956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = str[i]; 1461956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1462956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0xd800) 1463956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 1; 1464956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0xe000) 1465956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1466956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1467956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Invalid sequence in conversion input")); 1468956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1469956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1470956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1471956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0x10000) 1472956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 1; 1473956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else if (wc < 0x110000) 1474956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor n16 += 2; 1475956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1476956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1477956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1478956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor _("Character out of range for UTF-16")); 1479956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1480956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor goto err_out; 1481956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1482956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1483956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor i++; 1484956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1485956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1486956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result = g_new (gunichar2, n16 + 1); 
1487956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1488956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor for (i = 0, j = 0; j < n16; i++) 1489956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1490956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor gunichar wc = str[i]; 1491956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1492956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (wc < 0x10000) 1493956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1494956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[j++] = wc; 1495956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1496956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor else 1497956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor { 1498956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[j++] = (wc - 0x10000) / 0x400 + 0xd800; 1499956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[j++] = (wc - 0x10000) % 0x400 + 0xdc00; 1500956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1501956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor } 1502956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor result[j] = 0; 1503956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1504956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_written) 1505956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_written = n16; 1506956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1507956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor err_out: 1508956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor if (items_read) 1509956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor *items_read = i; 1510956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 1511ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor return result; 1512ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor} 1513ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor 151440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen#define CONTINUATION_CHAR \ 151540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen G_STMT_START { \ 
151640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \ 151740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; \ 151840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val <<= 6; \ 151940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val |= (*(guchar *)p) & 0x3f; \ 152040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } G_STMT_END 152140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 152240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasenstatic const gchar * 152340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasenfast_validate (const char *str) 152440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 152540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen{ 152640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gunichar val = 0; 152740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gunichar min = 0; 152840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar *p; 152940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 153040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen for (p = str; *p; p++) 153140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 153240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (*(guchar *)p < 128) 153340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen /* done */; 153440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 153540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 153640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar *last; 153740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 153840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen last = p; 153940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ 154040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 154140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) 
154240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 154340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 154440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ 154540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 154640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 154740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 154840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 154940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ 155040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 155140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen min = (1 << 11); 155240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val = *(guchar *)p & 0x0f; 155340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto TWO_REMAINING; 155440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 155540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ 155640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 155740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen min = (1 << 16); 155840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val = *(guchar *)p & 0x07; 155940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 156040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 156140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 156240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 156340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 156440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 156540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen TWO_REMAINING: 156640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 156740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 
156840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 156940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 157040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 157140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (val < min)) 157240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 157340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 157440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (!UNICODE_VALID(val))) 157540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 157640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 157740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 157840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen continue; 157940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 158040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen error: 158140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen return last; 158240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 158340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 158440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 158540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen return p; 158640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen} 158740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 158840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasenstatic const gchar * 158940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasenfast_validate_len (const char *str, 159040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gssize max_len) 159140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 159240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen{ 159340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gunichar val = 0; 159440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gunichar min = 0; 159540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar *p; 
159640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 159740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen for (p = str; (max_len < 0 || (p - str) < max_len) && *p; p++) 159840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 159940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (*(guchar *)p < 128) 160040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen /* done */; 160140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 160240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 160340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar *last; 160440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 160540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen last = p; 160640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ 160740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 160840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 2)) 160940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 161040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 161140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) 161240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 161340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 161440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ 161540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 161640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 161740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 161840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 161940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ 162040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 
162140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 3)) 162240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 162340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 162440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen min = (1 << 11); 162540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val = *(guchar *)p & 0x0f; 162640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto TWO_REMAINING; 162740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 162840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ 162940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen { 163040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 4)) 163140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 163240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 163340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen min = (1 << 16); 163440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen val = *(guchar *)p & 0x07; 163540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 163640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 163740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 163840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 163940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 164040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 164140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen TWO_REMAINING: 164240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 164340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 164440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p++; 164540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen CONTINUATION_CHAR; 164640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 
164740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (val < min)) 164840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 164940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (G_UNLIKELY (!UNICODE_VALID(val))) 165040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen goto error; 165140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 165240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 165340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen continue; 165440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 165540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen error: 165640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen return last; 165740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 165840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen } 165940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 166040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen return p; 166140fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen} 166240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 1663ddbb2ea440c537a88d55761ad2bc5fbd23149b6aOwen Taylor/** 166449c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * g_utf8_validate: 166549c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * @str: a pointer to character data 166640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * @max_len: max bytes to validate, or -1 to go until NUL 166749c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * @end: return location for end of valid data 166849c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * 166949c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * Validates UTF-8 encoded text. @str is the text to validate; 167049c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * if @str is nul-terminated, then @max_len can be -1, otherwise 167149c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * @max_len should be the number of bytes to validate. 
1672ebec3d7ce7c2ca56ab1167403669a74bfe12b110Havoc Pennington * If @end is non-%NULL, then the end of the valid range 167340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * will be stored there (i.e. the start of the first invalid 167440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * character if some bytes were invalid, or the end of the text 167540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * being validated otherwise). 167640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * 167740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * Note that g_utf8_validate() returns %FALSE if @max_len is 167840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen * positive and NUL is met before @max_len bytes have been read. 167949c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * 168050d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Returns %TRUE if all of @str was valid. Many GLib and GTK+ 168150d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * routines <emphasis>require</emphasis> valid UTF-8 as input; 168249c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * so data read from a file or the network should be checked 168349c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * with g_utf8_validate() before doing anything else with it. 
168449c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington * 168550d0ad98034e764808c774f6ad496cf9f129dd16Matthias Clasen * Return value: %TRUE if the text was valid UTF-8 168649c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington **/ 168749c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Penningtongboolean 168840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Claseng_utf8_validate (const char *str, 168940fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen gssize max_len, 169040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen const gchar **end) 169149c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington 169240fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen{ 169349c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington const gchar *p; 1694956f00ed96228526cbeda1432df1f729e6f13322Owen Taylor 169540fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if (max_len < 0) 169640fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p = fast_validate (str); 169740fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen else 169840fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen p = fast_validate_len (str, max_len); 169949c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington 170049c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington if (end) 170149c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington *end = p; 1702b0baf3db03355028f7c7eabdef9bcfa7f4e08eafHavoc Pennington 170340fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen if ((max_len >= 0 && p != str + max_len) || 170440fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen (max_len < 0 && *p != '\0')) 1705b0baf3db03355028f7c7eabdef9bcfa7f4e08eafHavoc Pennington return FALSE; 1706b0baf3db03355028f7c7eabdef9bcfa7f4e08eafHavoc Pennington else 1707b0baf3db03355028f7c7eabdef9bcfa7f4e08eafHavoc Pennington return TRUE; 170849c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington} 170949c937fcbb0add57ab215c0c65ba3a02e6fb13c9Havoc Pennington 171040fb4cff1019d266e4f7bdcac361d67406b54f45Matthias Clasen 
1711fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington/** 1712fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * g_unichar_validate: 1713fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * @ch: a Unicode character 1714fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * 1715fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * Checks whether @ch is a valid Unicode character. Some possible 1716fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * integer values of @ch will not be valid. 0 is considered a valid 1717fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * character, though it's normally a string terminator. 1718fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * 1719fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington * Return value: %TRUE if @ch is a valid Unicode character 1720fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington **/ 1721fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Penningtongboolean 1722fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Penningtong_unichar_validate (gunichar ch) 1723fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington{ 1724fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington return UNICODE_VALID (ch); 1725fad8693b761ce42506f4c3ec1cfb11c2067eb0c6Havoc Pennington} 17261bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen 17271bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen/** 17281bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * g_utf8_strreverse: 17291bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * @str: a UTF-8 encoded string 17301bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * @len: the maximum length of @str to use. If @len < 0, then 17311bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * the string is nul-terminated. 17321bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * 17331bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text. 
17341bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * (Use g_utf8_validate() on all text before trying to use UTF-8 17351bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * utility functions with it.) 17361bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * 17371bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * Note that unlike g_strreverse(), this function returns 17381bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * newly-allocated memory, which should be freed with g_free() when 17391bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * no longer needed. 17401bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * 17411bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen * Returns: a newly-allocated string which is the reverse of @str. 1742a69dc4b65d07db32e200f1100bdeab898720c3c3Matthias Clasen * 1743a69dc4b65d07db32e200f1100bdeab898720c3c3Matthias Clasen * Since: 2.2 17441bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen */ 17451bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasengchar * 17461bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Claseng_utf8_strreverse (const gchar *str, 17471bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen gssize len) 17481bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen{ 17491bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen gchar *result; 17501bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen const gchar *p; 17511bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen gchar *m, *r, skip; 17521bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen 17531bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen if (len < 0) 17541bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen len = strlen (str); 17551bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen 17561bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen result = g_new (gchar, len + 1); 17571bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen r = result + len; 
17581bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen p = str; 17591bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen while (*p) 17601bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen { 17611bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen skip = g_utf8_skip[*(guchar*)p]; 17621bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen r -= skip; 17631bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen for (m = r; skip; skip--) 17641bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen *m++ = *p++; 17651bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen } 17661bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen result[len] = 0; 17671bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen 17681bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen return result; 17691bb885b3b2cdf5e2371b6ec0f05f3ada95735f93Matthias Clasen} 1770608a31b98e1420f487190871ee7312db2643d93dMatthias Clasen 1771608a31b98e1420f487190871ee7312db2643d93dMatthias Clasen#define __G_UTF8_C__ 1772608a31b98e1420f487190871ee7312db2643d93dMatthias Clasen#include "galiasdef.c" 1773