/**
 * \file unicode.c
 *
 * This file contains general Unicode string manipulation functions.
 * It mainly consists of functions for converting between UCS-2 (used on
 * the devices) and UTF-8 (used by several applications).
 *
 * For a deeper understanding of Unicode encoding formats see the
 * Wikipedia entries for
 * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a>
 * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
 *
 * Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 */

#include <config.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_ICONV
#include "iconv.h"
#else
#error "libmtp unicode.c needs fixing to work without iconv()!"
#endif
#include "libmtp.h"
#include "unicode.h"
#include "util.h"
#include "ptp.h"

/**
 * The size of the buffer (in characters) used for creating string copies.
 * The conversion routines in this file size their on-stack scratch
 * buffers as a multiple of this value.
 */
#define STRING_BUFFER_LENGTH 1024

/**
 * Gets the length (in characters, not bytes) of a unicode
 * UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00
 * will return a value of 1.
 *
 * @param unicstr a UCS-2 Unicode string terminated by a 16-bit zero,
 *        or NULL.
 * @return the length of the string, in number of 16-bit characters,
 *         not counting the terminator. If you want to know the length
 *         in bytes, multiply this by two and add two (for zero
 *         terminator). A NULL pointer is reported as length 0.
 */
int ucs2_strlen(uint16_t const * const unicstr)
{
  int length = 0;

  /* Treat a missing string like an empty one instead of dereferencing it. */
  if (unicstr == NULL) {
    return 0;
  }

  /* Unicode strings are terminated with 2 * 0x00 */
  while (unicstr[length] != 0x0000U) {
    length++;
  }
  return length;
}

693aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev/**
703aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Converts a big-endian UTF-16 2-byte string
713aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * to a UTF-8 string. Actually just a UCS-2 internal conversion
723aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * routine that strips off the BOM if there is one.
733aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev *
743aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @param device a pointer to the current device.
753aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @param unicstr the UTF-16 unicode string to convert
763aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @return a UTF-8 string.
773aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev */
783aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishevchar *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr)
793aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev{
803aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  PTPParams *params = (PTPParams *) device->params;
813aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  char *stringp = (char *) unicstr;
823aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
833aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  char *locp = loclstr;
843aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  size_t nconv;
853aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
863aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  size_t convmax = STRING_BUFFER_LENGTH*3;
873aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
883aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  loclstr[0]='\0';
893aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  /* Do the conversion.  */
903aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax);
913aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  if (nconv == (size_t) -1) {
923aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev    // Return partial string anyway.
933aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev    *locp = '\0';
943aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  }
953aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  loclstr[STRING_BUFFER_LENGTH*3] = '\0';
963aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  // Strip off any BOM, it's totally useless...
973aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
983aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev    return strdup(loclstr+3);
993aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  }
1003aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  return strdup(loclstr);
1013aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev}
1023aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
1033aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev/**
1043aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Converts a UTF-8 string to a big-endian UTF-16 2-byte string
1053aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Actually just a UCS-2 internal conversion.
1063aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev *
1073aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @param device a pointer to the current device.
1083aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @param localstr the UTF-8 unicode string to convert
1093aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * @return a UTF-16 string.
1103aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev */
1113aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishevuint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr)
1123aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev{
1133aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  PTPParams *params = (PTPParams *) device->params;
1143aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  char *stringp = (char *) localstr; // cast away "const"
1153aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char.
1163aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  char *unip = unicstr;
1173aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  size_t nconv = 0;
1183aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator
1193aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  size_t convmax = STRING_BUFFER_LENGTH*2;
1203aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
1213aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  unicstr[0]='\0';
1223aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  unicstr[1]='\0';
1233aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
1243aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  /* Do the conversion.  */
1253aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax);
1263aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
1273aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  if (nconv == (size_t) -1) {
1283aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev    // Return partial string anyway.
1293aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev    unip[0] = '\0';
1303aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev    unip[1] = '\0';
1313aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  }
1323aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  // make sure the string is null terminated
1333aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  unicstr[STRING_BUFFER_LENGTH*2] = '\0';
1343aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  unicstr[STRING_BUFFER_LENGTH*2+1] = '\0';
1353aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
1363aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  // allocate the string to be returned
1373aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  // Note: can't use strdup since every other byte is a null byte
1383aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2;
1393aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  uint16_t* ret = malloc(ret_len);
1403aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  memcpy(ret,unicstr,(size_t)ret_len);
1413aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev  return ret;
1423aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev}
1433aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
/**
 * This helper function simply removes any consecutive chars
 * > 0x7F and replaces them with a single underscore. In UTF-8
 * consecutive chars > 0x7F represent one single character so
 * it has to be done like this (and it's elegant). Note that a run
 * of several adjacent multi-byte characters therefore also collapses
 * into one underscore. Despite the name, 7-bit characters are the
 * ones that are kept. The string is modified in place; it will only
 * shrink in size so no copying is needed.
 *
 * @param str the zero-terminated string to clean up in place. A NULL
 *        pointer is ignored.
 */
void strip_7bit_from_utf8(char *str)
{
  size_t src = 0; /* read position */
  size_t dst = 0; /* write position, never ahead of src */

  if (str == NULL) {
    return;
  }

  while (str[src] != '\0') {
    if ((uint8_t) str[src] > 0x7FU) {
      str[dst++] = '_';
      // Skip over the whole run of > 0x7F chars; the terminator
      // is 0x00 so the scan cannot run past the end of the string.
      do {
        src++;
      } while ((uint8_t) str[src] > 0x7FU);
    } else {
      str[dst++] = str[src++];
    }
  }
  // Terminate stripped string...
  str[dst] = '\0';
}