1b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij/** 2b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * \file unicode.c 3b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * 4b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * This file contains general Unicode string manipulation functions. 5b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * It mainly consist of functions for converting between UCS-2 (used on 6c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * the devices) and UTF-8 (used by several applications). 7c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * 8c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * For a deeper understanding of Unicode encoding formats see the 9c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * Wikipedia entries for 10c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a> 11c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>. 122f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * 136db174f4d18af574fab991309e11010aa0eb4543Linus Walleij * Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se> 142f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * 152f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * This library is free software; you can redistribute it and/or 162f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * modify it under the terms of the GNU Lesser General Public 172f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * License as published by the Free Software Foundation; either 182f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * version 2 of the License, or (at your option) any later version. 192f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * 202f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * This library is distributed in the hope that it will be useful, 212f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * but WITHOUT ANY WARRANTY; without even the implied warranty of 222f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 232f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * Lesser General Public License for more details. 242f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * 252f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * You should have received a copy of the GNU Lesser General Public 262f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * License along with this library; if not, write to the 272f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 282f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * Boston, MA 02111-1307, USA. 292f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * 30b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij */ 31b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij 32e65fc0b7ffe71f8aede07c4c5a0f2a9893389543Lei Zhang#include "config.h" 33e65fc0b7ffe71f8aede07c4c5a0f2a9893389543Lei Zhang 34b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include <stdlib.h> 35b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include <string.h> 366db174f4d18af574fab991309e11010aa0eb4543Linus Walleij#ifdef HAVE_ICONV 37c6d7c983221a453142444377f7168db627f747a5Linus Walleij#include "iconv.h" 386db174f4d18af574fab991309e11010aa0eb4543Linus Walleij#else 396db174f4d18af574fab991309e11010aa0eb4543Linus Walleij#error "libmtp unicode.c needs fixing to work without iconv()!" 406db174f4d18af574fab991309e11010aa0eb4543Linus Walleij#endif 41b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include "libmtp.h" 42b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include "unicode.h" 43b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include "util.h" 44d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij#include "ptp.h" 4516571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij 46b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij/** 47b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * The size of the buffer (in characters) used for creating string copies. 48b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij */ 4916571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij#define STRING_BUFFER_LENGTH 1024 50b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij 51f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij/** 52f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * Gets the length (in characters, not bytes) of a unicode 53b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00 54b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * will return a value of 1. 55b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * 56b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * @param unicstr a UCS-2 Unicode string 57f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * @return the length of the string, in number of characters. If you 58b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * want to know the length in bytes, multiply this by two and 59b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * add two (for zero terminator). 60b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij */ 61438bd7fdc131da4214f40705eb10745f72b19b5fLinus Walleijint ucs2_strlen(uint16_t const * const unicstr) 62b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij{ 63b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij int length; 64f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij 65b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij /* Unicode strings are terminated with 2 * 0x00 */ 66b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij for(length = 0; unicstr[length] != 0x0000U; length ++); 67b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij return length; 68b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij} 69b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij 70b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij/** 7116571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij * Converts a big-endian UTF-16 2-byte string 72d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij * to a UTF-8 string. Actually just a UCS-2 internal conversion 73d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij * routine that strips off the BOM if there is one. 7416571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij * 75eab650b3948989622e477f7046b483fa3b7b0b2eLinus Walleij * @param device a pointer to the current device. 7616571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij * @param unicstr the UTF-16 unicode string to convert 7716571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij * @return a UTF-8 string. 78b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij */ 793ec863176f87f621068888241e923f1599e820aeLinus Walleijchar *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr) 8016571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij{ 81d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij PTPParams *params = (PTPParams *) device->params; 82d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij char *stringp = (char *) unicstr; 83d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char. 84d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij char *locp = loclstr; 85d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij size_t nconv; 86d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator 87d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij size_t convmax = STRING_BUFFER_LENGTH*3; 88f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij 89d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij loclstr[0]='\0'; 90d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij /* Do the conversion. */ 91d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax); 92d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij if (nconv == (size_t) -1) { 93d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij // Return partial string anyway. 94d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij *locp = '\0'; 9516571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij } 96d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij loclstr[STRING_BUFFER_LENGTH*3] = '\0'; 97d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij // Strip off any BOM, it's totally useless... 98d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) { 99d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij return strdup(loclstr+3); 10016571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij } 101d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij return strdup(loclstr); 10216571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij} 103d3b7857c89301fdada125333751706278f75140fLinus Walleij 104d3b7857c89301fdada125333751706278f75140fLinus Walleij/** 105f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * Converts a UTF-8 string to a big-endian UTF-16 2-byte string 106f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * Actually just a UCS-2 internal conversion. 107f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * 108f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * @param device a pointer to the current device. 109f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * @param localstr the UTF-8 unicode string to convert 110f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * @return a UTF-16 string. 111f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij */ 112f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleijuint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr) 113f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij{ 114f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij PTPParams *params = (PTPParams *) device->params; 115f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij char *stringp = (char *) localstr; // cast away "const" 116f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char. 117f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij char *unip = unicstr; 118f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij size_t nconv = 0; 119f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator 120f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij size_t convmax = STRING_BUFFER_LENGTH*2; 121f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij 122f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij unicstr[0]='\0'; 123f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij unicstr[1]='\0'; 124f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij 125f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij /* Do the conversion. */ 126f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax); 127f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij 128f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij if (nconv == (size_t) -1) { 129f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij // Return partial string anyway. 130f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij unip[0] = '\0'; 131f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij unip[1] = '\0'; 132f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij } 133f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij // make sure the string is null terminated 134f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij unicstr[STRING_BUFFER_LENGTH*2] = '\0'; 135f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij unicstr[STRING_BUFFER_LENGTH*2+1] = '\0'; 136f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij 137f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij // allocate the string to be returned 138f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij // Note: can't use strdup since every other byte is a null byte 139f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2; 140f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij uint16_t* ret = malloc(ret_len); 141f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij memcpy(ret,unicstr,(size_t)ret_len); 142f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij return ret; 143f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij} 144f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij 145f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij/** 146d3b7857c89301fdada125333751706278f75140fLinus Walleij * This helper function simply removes any consecutive chars 147d3b7857c89301fdada125333751706278f75140fLinus Walleij * > 0x7F and replace then with an underscore. In UTF-8 148d3b7857c89301fdada125333751706278f75140fLinus Walleij * consequtive chars > 0x7F represent one single character so 149d3b7857c89301fdada125333751706278f75140fLinus Walleij * it has to be done like this (and it's elegant). It will only 150d3b7857c89301fdada125333751706278f75140fLinus Walleij * shrink the string in size so no copying is needed. 151d3b7857c89301fdada125333751706278f75140fLinus Walleij */ 152d3b7857c89301fdada125333751706278f75140fLinus Walleijvoid strip_7bit_from_utf8(char *str) 153d3b7857c89301fdada125333751706278f75140fLinus Walleij{ 154d3b7857c89301fdada125333751706278f75140fLinus Walleij int i,j,k; 155d3b7857c89301fdada125333751706278f75140fLinus Walleij i = 0; 156d3b7857c89301fdada125333751706278f75140fLinus Walleij j = 0; 157d3b7857c89301fdada125333751706278f75140fLinus Walleij k = strlen(str); 158d3b7857c89301fdada125333751706278f75140fLinus Walleij while (i < k) { 159d3b7857c89301fdada125333751706278f75140fLinus Walleij if ((uint8_t) str[i] > 0x7FU) { 160d3b7857c89301fdada125333751706278f75140fLinus Walleij str[j] = '_'; 1611a90559fbd568c1428235324b764d3dc4bb1b5e9Linus Walleij i++; 162d3b7857c89301fdada125333751706278f75140fLinus Walleij // Skip over any consequtive > 0x7F chars. 163d3b7857c89301fdada125333751706278f75140fLinus Walleij while((uint8_t) str[i] > 0x7FU) { 164d3b7857c89301fdada125333751706278f75140fLinus Walleij i++; 165d3b7857c89301fdada125333751706278f75140fLinus Walleij } 166d3b7857c89301fdada125333751706278f75140fLinus Walleij } else { 167d3b7857c89301fdada125333751706278f75140fLinus Walleij str[j] = str[i]; 168d3b7857c89301fdada125333751706278f75140fLinus Walleij i++; 169d3b7857c89301fdada125333751706278f75140fLinus Walleij } 170d3b7857c89301fdada125333751706278f75140fLinus Walleij j++; 171d3b7857c89301fdada125333751706278f75140fLinus Walleij } 172d3b7857c89301fdada125333751706278f75140fLinus Walleij // Terminate stripped string... 173d3b7857c89301fdada125333751706278f75140fLinus Walleij str[j] = '\0'; 174d3b7857c89301fdada125333751706278f75140fLinus Walleij} 175