13aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev/** 23aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * \file unicode.c 33aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 43aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * This file contains general Unicode string manipulation functions. 53aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * It mainly consist of functions for converting between UCS-2 (used on 63aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * the devices) and UTF-8 (used by several applications). 73aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 83aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * For a deeper understanding of Unicode encoding formats see the 93aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Wikipedia entries for 103aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a> 113aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>. 123aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 133aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se> 143aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 153aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * This library is free software; you can redistribute it and/or 163aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * modify it under the terms of the GNU Lesser General Public 173aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * License as published by the Free Software Foundation; either 183aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * version 2 of the License, or (at your option) any later version. 193aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 203aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * This library is distributed in the hope that it will be useful, 213aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * but WITHOUT ANY WARRANTY; without even the implied warranty of 223aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 233aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Lesser General Public License for more details. 243aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 253aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * You should have received a copy of the GNU Lesser General Public 263aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * License along with this library; if not, write to the 273aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 283aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Boston, MA 02111-1307, USA. 293aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 303aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev */ 313aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 323aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include <config.h> 333aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include <stdlib.h> 343aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include <string.h> 353aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#ifdef HAVE_ICONV 363aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "iconv.h" 373aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#else 383aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#error "libmtp unicode.c needs fixing to work without iconv()!" 393aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#endif 403aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "libmtp.h" 413aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "unicode.h" 423aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "util.h" 433aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "ptp.h" 443aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 453aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev/** 463aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * The size of the buffer (in characters) used for creating string copies. 473aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev */ 483aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#define STRING_BUFFER_LENGTH 1024 493aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 503aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev/** 513aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Gets the length (in characters, not bytes) of a unicode 523aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00 533aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * will return a value of 1. 543aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 553aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @param unicstr a UCS-2 Unicode string 563aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @return the length of the string, in number of characters. If you 573aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * want to know the length in bytes, multiply this by two and 583aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * add two (for zero terminator). 593aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev */ 603aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishevint ucs2_strlen(uint16_t const * const unicstr) 613aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev{ 623aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev int length; 633aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 643aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev /* Unicode strings are terminated with 2 * 0x00 */ 653aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev for(length = 0; unicstr[length] != 0x0000U; length ++); 663aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev return length; 673aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev} 683aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 693aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev/** 703aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Converts a big-endian UTF-16 2-byte string 713aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * to a UTF-8 string. Actually just a UCS-2 internal conversion 723aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * routine that strips off the BOM if there is one. 733aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 743aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @param device a pointer to the current device. 753aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @param unicstr the UTF-16 unicode string to convert 763aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @return a UTF-8 string. 773aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev */ 783aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishevchar *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr) 793aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev{ 803aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev PTPParams *params = (PTPParams *) device->params; 813aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev char *stringp = (char *) unicstr; 823aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char. 833aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev char *locp = loclstr; 843aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev size_t nconv; 853aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator 863aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev size_t convmax = STRING_BUFFER_LENGTH*3; 873aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 883aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev loclstr[0]='\0'; 893aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev /* Do the conversion. */ 903aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax); 913aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev if (nconv == (size_t) -1) { 923aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev // Return partial string anyway. 933aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev *locp = '\0'; 943aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev } 953aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev loclstr[STRING_BUFFER_LENGTH*3] = '\0'; 963aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev // Strip off any BOM, it's totally useless... 973aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) { 983aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev return strdup(loclstr+3); 993aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev } 1003aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev return strdup(loclstr); 1013aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev} 1023aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 1033aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev/** 1043aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Converts a UTF-8 string to a big-endian UTF-16 2-byte string 1053aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Actually just a UCS-2 internal conversion. 1063aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * 1073aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @param device a pointer to the current device. 1083aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @param localstr the UTF-8 unicode string to convert 1093aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @return a UTF-16 string. 1103aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev */ 1113aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishevuint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr) 1123aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev{ 1133aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev PTPParams *params = (PTPParams *) device->params; 1143aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev char *stringp = (char *) localstr; // cast away "const" 1153aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char. 1163aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev char *unip = unicstr; 1173aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev size_t nconv = 0; 1183aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator 1193aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev size_t convmax = STRING_BUFFER_LENGTH*2; 1203aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 1213aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev unicstr[0]='\0'; 1223aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev unicstr[1]='\0'; 1233aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 1243aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev /* Do the conversion. */ 1253aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax); 1263aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 1273aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev if (nconv == (size_t) -1) { 1283aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev // Return partial string anyway. 1293aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev unip[0] = '\0'; 1303aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev unip[1] = '\0'; 1313aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev } 1323aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev // make sure the string is null terminated 1333aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev unicstr[STRING_BUFFER_LENGTH*2] = '\0'; 1343aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev unicstr[STRING_BUFFER_LENGTH*2+1] = '\0'; 1353aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 1363aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev // allocate the string to be returned 1373aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev // Note: can't use strdup since every other byte is a null byte 1383aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2; 1393aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev uint16_t* ret = malloc(ret_len); 1403aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev memcpy(ret,unicstr,(size_t)ret_len); 1413aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev return ret; 1423aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev} 1433aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev 1443aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev/** 1453aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * This helper function simply removes any consecutive chars 1463aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * > 0x7F and replace then with an underscore. In UTF-8 1473aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * consequtive chars > 0x7F represent one single character so 1483aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * it has to be done like this (and it's elegant). It will only 1493aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * shrink the string in size so no copying is needed. 1503aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev */ 1513aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishevvoid strip_7bit_from_utf8(char *str) 1523aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev{ 1533aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev int i,j,k; 1543aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev i = 0; 1553aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev j = 0; 1563aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev k = strlen(str); 1573aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev while (i < k) { 1583aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev if ((uint8_t) str[i] > 0x7FU) { 1593aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev str[j] = '_'; 1603aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev i++; 1613aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev // Skip over any consequtive > 0x7F chars. 1623aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev while((uint8_t) str[i] > 0x7FU) { 1633aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev i++; 1643aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev } 1653aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev } else { 1663aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev str[j] = str[i]; 1673aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev i++; 1683aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev } 1693aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev j++; 1703aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev } 1713aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev // Terminate stripped string... 1723aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev str[j] = '\0'; 1733aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev} 174