1b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij/**
2b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij * \file unicode.c
3b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij *
 * This file contains general Unicode string manipulation functions.
 * It mainly consists of functions for converting between UCS-2 (used on
 * the devices) and UTF-8 (used by several applications).
7c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij *
8c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * For a deeper understanding of Unicode encoding formats see the
9c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * Wikipedia entries for
10c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a>
11c00f70d8b4f20bcdd03bbf10b5ac718b672cc797Linus Walleij * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
122f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij *
136db174f4d18af574fab991309e11010aa0eb4543Linus Walleij * Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se>
142f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij *
152f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * This library is free software; you can redistribute it and/or
162f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * modify it under the terms of the GNU Lesser General Public
172f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * License as published by the Free Software Foundation; either
182f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * version 2 of the License, or (at your option) any later version.
192f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij *
202f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * This library is distributed in the hope that it will be useful,
212f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * but WITHOUT ANY WARRANTY; without even the implied warranty of
222f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
232f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * Lesser General Public License for more details.
242f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij *
252f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * You should have received a copy of the GNU Lesser General Public
262f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * License along with this library; if not, write to the
272f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
282f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij * Boston, MA 02111-1307, USA.
292f45d224ce34d3fdc5eb255cde6ed9e2c407f218Linus Walleij *
30b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij */
31b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij
32e65fc0b7ffe71f8aede07c4c5a0f2a9893389543Lei Zhang#include "config.h"
33e65fc0b7ffe71f8aede07c4c5a0f2a9893389543Lei Zhang
34b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include <stdlib.h>
35b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include <string.h>
366db174f4d18af574fab991309e11010aa0eb4543Linus Walleij#ifdef HAVE_ICONV
37c6d7c983221a453142444377f7168db627f747a5Linus Walleij#include "iconv.h"
386db174f4d18af574fab991309e11010aa0eb4543Linus Walleij#else
396db174f4d18af574fab991309e11010aa0eb4543Linus Walleij#error "libmtp unicode.c needs fixing to work without iconv()!"
406db174f4d18af574fab991309e11010aa0eb4543Linus Walleij#endif
41b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include "libmtp.h"
42b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include "unicode.h"
43b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij#include "util.h"
44d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij#include "ptp.h"
4516571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij
/**
 * The size of the buffer (in characters) used for creating string copies.
 *
 * The conversion routines in this file size their on-stack scratch
 * buffers from this value, so a converted string is cut off once it
 * no longer fits in a buffer derived from this many characters.
 */
#define STRING_BUFFER_LENGTH 1024
50b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij
/**
 * Gets the length (in characters, not bytes) of a unicode
 * UCS-2 string, e.g. a string which physically is 0x00 0x41 0x00 0x00
 * will return a value of 1.
 *
 * @param unicstr a UCS-2 Unicode string terminated by a 0x0000 code
 *        unit. A NULL pointer is treated as an empty string.
 * @return the length of the string, in number of characters, not
 *         counting the terminator. If you want to know the length in
 *         bytes, multiply this by two and add two (for the zero
 *         terminator).
 */
int ucs2_strlen(uint16_t const * const unicstr)
{
  int length = 0;

  /* Nothing to measure: report an empty string rather than crash. */
  if (unicstr == NULL) {
    return 0;
  }

  /* Unicode strings are terminated with 2 * 0x00, i.e. one 16-bit zero. */
  while (unicstr[length] != 0x0000U) {
    length++;
  }
  return length;
}
69b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij
70b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij/**
7116571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij * Converts a big-endian UTF-16 2-byte string
72d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij * to a UTF-8 string. Actually just a UCS-2 internal conversion
73d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij * routine that strips off the BOM if there is one.
7416571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij *
75eab650b3948989622e477f7046b483fa3b7b0b2eLinus Walleij * @param device a pointer to the current device.
7616571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij * @param unicstr the UTF-16 unicode string to convert
7716571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij * @return a UTF-8 string.
78b9256fd19e26a86ce8dedb9284684fd8b13de2e1Linus Walleij */
793ec863176f87f621068888241e923f1599e820aeLinus Walleijchar *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr)
8016571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij{
81d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  PTPParams *params = (PTPParams *) device->params;
82d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  char *stringp = (char *) unicstr;
83d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
84d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  char *locp = loclstr;
85d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  size_t nconv;
86d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
87d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  size_t convmax = STRING_BUFFER_LENGTH*3;
88f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij
89d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  loclstr[0]='\0';
90d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  /* Do the conversion.  */
91d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax);
92d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  if (nconv == (size_t) -1) {
93d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij    // Return partial string anyway.
94d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij    *locp = '\0';
9516571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij  }
96d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  loclstr[STRING_BUFFER_LENGTH*3] = '\0';
97d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  // Strip off any BOM, it's totally useless...
98d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
99d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij    return strdup(loclstr+3);
10016571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij  }
101d5d51c86f6c5429fe360ce526a7c6bf3414ddacbLinus Walleij  return strdup(loclstr);
10216571dcb63b48338c4d45a0a1e7c7610aa3ca92aLinus Walleij}
103d3b7857c89301fdada125333751706278f75140fLinus Walleij
104d3b7857c89301fdada125333751706278f75140fLinus Walleij/**
105f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * Converts a UTF-8 string to a big-endian UTF-16 2-byte string
106f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * Actually just a UCS-2 internal conversion.
107f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij *
108f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * @param device a pointer to the current device.
109f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * @param localstr the UTF-8 unicode string to convert
110f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij * @return a UTF-16 string.
111f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij */
112f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleijuint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr)
113f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij{
114f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  PTPParams *params = (PTPParams *) device->params;
115f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  char *stringp = (char *) localstr; // cast away "const"
116f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char.
117f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  char *unip = unicstr;
118f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  size_t nconv = 0;
119f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator
120f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  size_t convmax = STRING_BUFFER_LENGTH*2;
121f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij
122f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  unicstr[0]='\0';
123f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  unicstr[1]='\0';
124f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij
125f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  /* Do the conversion.  */
126f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax);
127f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij
128f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  if (nconv == (size_t) -1) {
129f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij    // Return partial string anyway.
130f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij    unip[0] = '\0';
131f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij    unip[1] = '\0';
132f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  }
133f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  // make sure the string is null terminated
134f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  unicstr[STRING_BUFFER_LENGTH*2] = '\0';
135f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  unicstr[STRING_BUFFER_LENGTH*2+1] = '\0';
136f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij
137f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  // allocate the string to be returned
138f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  // Note: can't use strdup since every other byte is a null byte
139f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2;
140f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  uint16_t* ret = malloc(ret_len);
141f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  memcpy(ret,unicstr,(size_t)ret_len);
142f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij  return ret;
143f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij}
144f3c4405edbf6a9b5c61057c0c2ffbaf067e89cf4Linus Walleij
/**
 * This helper function replaces every run of consecutive bytes
 * > 0x7F with a single underscore, in place, leaving a pure 7-bit
 * string behind. In UTF-8 every byte of a non-ASCII character is
 * > 0x7F, so a lone non-ASCII character becomes one underscore. Note
 * that several non-ASCII characters directly following each other
 * form one run and therefore also collapse into a single underscore.
 * The result is never longer than the input, so no copying is needed.
 *
 * @param str the zero-terminated string to rewrite in place. A NULL
 *        pointer is ignored.
 */
void strip_7bit_from_utf8(char *str)
{
  size_t in = 0;  // next byte to read
  size_t out = 0; // next byte to write, never ahead of "in"
  size_t len;

  if (str == NULL) {
    return;
  }
  len = strlen(str);

  while (in < len) {
    if ((uint8_t) str[in] > 0x7FU) {
      str[out++] = '_';
      // Skip over the whole run of > 0x7F bytes. The terminating
      // 0x00 is not > 0x7F, so this cannot run off the end.
      do {
        in++;
      } while ((uint8_t) str[in] > 0x7FU);
    } else {
      str[out++] = str[in++];
    }
  }
  // Terminate stripped string...
  str[out] = '\0';
}
175