/*
** 2004 April 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
**  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
**
**
** Notes on UTF-16:  (with wwww+1==uuuuu)
**
**      Word-0               Word-1          Value
**  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
**  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
**
**
** BOM or Byte Order Mark:
**     0xff 0xfe   little-endian utf-16 follows
**     0xfe 0xff   big-endian utf-16 follows
**
*/
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "sqliteInt.h"
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <assert.h>
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "vdbeInt.h"
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
#ifndef SQLITE_AMALGAMATION
/*
** The following constant value is used by the SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros (which are defined outside this file).
** It is only defined here when the library is not built as a single
** amalgamation translation unit.
*/
const int sqlite3one = 1;
#endif /* SQLITE_AMALGAMATION */
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.
**
** The table is indexed by (lead byte - 0xC0) and yields the payload bits
** carried by that lead byte:
**
**    0xC0..0xDF   low 5 bits
**    0xE0..0xEF   low 4 bits
**    0xF0..0xF7   low 3 bits
**    0xF8..0xFB   low 2 bits
**    0xFC..0xFD   low 1 bit
**    0xFE..0xFF   zero
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
** Append the UTF-8 encoding of the value c to the buffer at zOut and
** advance zOut past the bytes written (1 to 4 bytes).
**
**   zOut   Lvalue pointer to bytes; incremented once per byte written.
**   c      Value to encode.  It is evaluated several times, so it must
**          not have side effects.
**
** Every value of 0x10000 or more takes the 4-byte form; only the low
** 21 bits of such a value are encoded.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( (c)<0x00080 ){                                   \
    *(zOut)++ = (u8)((c)&0xFF);                        \
  }                                                    \
  else if( (c)<0x00800 ){                              \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }                                                    \
  else if( (c)<0x10000 ){                              \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);          \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }else{                                               \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);         \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);         \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);          \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);               \
  }                                                    \
}
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
** Append the little-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.
**
** Values up to 0xFFFF are written as one 16-bit unit (2 bytes, low byte
** first).  Larger values are written as two units (4 bytes) whose high
** bytes are 0xD8.. and 0xDC.. respectively.
**
**   zOut   Lvalue pointer to bytes; incremented once per byte written.
**   c      Value to encode.  It is evaluated several times, so it must
**          not have side effects.
*/
#define WRITE_UTF16LE(zOut, c) {                                         \
  if( (c)<=0xFFFF ){                                                     \
    *(zOut)++ = (u8)((c)&0x00FF);                                        \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                   \
  }else{                                                                 \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0)); \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));               \
    *(zOut)++ = (u8)((c)&0x00FF);                                        \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                          \
  }                                                                      \
}
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
** Append the big-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.
**
** Values up to 0xFFFF are written as one 16-bit unit (2 bytes, high byte
** first).  Larger values are written as two units (4 bytes) whose high
** bytes are 0xD8.. and 0xDC.. respectively.
**
**   zOut   Lvalue pointer to bytes; incremented once per byte written.
**   c      Value to encode.  It is evaluated several times, so it must
**          not have side effects.
*/
#define WRITE_UTF16BE(zOut, c) {                                         \
  if( (c)<=0xFFFF ){                                                     \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                   \
    *(zOut)++ = (u8)((c)&0x00FF);                                        \
  }else{                                                                 \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));               \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0)); \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                          \
    *(zOut)++ = (u8)((c)&0x00FF);                                        \
  }                                                                      \
}
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
** Read one character from the little-endian UTF-16 text at zIn, store
** its value in c, and advance zIn past the bytes consumed.
**
**   zIn    Lvalue pointer to bytes; incremented once per byte read.
**   TERM   Expression evaluated after the first 16-bit unit has been
**          consumed.  A second unit is read only if TERM is true.
**   c      Lvalue that receives the decoded value.
**
** If the first unit lies in 0xD800..0xDFFF and TERM is true, the next
** unit is consumed and combined with it as a surrogate pair.  The macro
** does not check that the first unit is a high surrogate or that the
** second unit is a low surrogate.  If TERM is false, c is left holding
** the raw first unit.
*/
#define READ_UTF16LE(zIn, TERM, c){                                       \
  (c) = (*(zIn)++);                                                       \
  (c) += ((*(zIn)++)<<8);                                                 \
  if( (c)>=0xD800 && (c)<0xE000 && (TERM) ){                              \
    int c2 = (*(zIn)++);                                                  \
    c2 += ((*(zIn)++)<<8);                                                \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10); \
  }                                                                       \
}
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
** Read one character from the big-endian UTF-16 text at zIn, store
** its value in c, and advance zIn past the bytes consumed.
**
**   zIn    Lvalue pointer to bytes; incremented once per byte read.
**   TERM   Expression evaluated after the first 16-bit unit has been
**          consumed.  A second unit is read only if TERM is true.
**   c      Lvalue that receives the decoded value.
**
** If the first unit lies in 0xD800..0xDFFF and TERM is true, the next
** unit is consumed and combined with it as a surrogate pair.  The macro
** does not check that the first unit is a high surrogate or that the
** second unit is a low surrogate.  If TERM is false, c is left holding
** the raw first unit.
*/
#define READ_UTF16BE(zIn, TERM, c){                                       \
  (c) = ((*(zIn)++)<<8);                                                  \
  (c) += (*(zIn)++);                                                      \
  if( (c)>=0xD800 && (c)<0xE000 && (TERM) ){                              \
    int c2 = ((*(zIn)++)<<8);                                             \
    c2 += (*(zIn)++);                                                     \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10); \
  }                                                                       \
}
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
** Translate a single UTF-8 character.  Return the unicode value.
**
** During translation, assume that the byte that zTerm points to
** is a 0x00: continuation bytes at or beyond zTerm are never read.
**
** The READ_UTF8() macro advances zIn past the bytes it consumes and
** stores the decoded value in c.  The sqlite3Utf8Read() function instead
** writes a pointer to the next unread byte back into *pzNext.
**
** The macro evaluates zIn, zTerm and c more than once; zIn and c must be
** side-effect-free lvalues.  The expansion is a single brace-enclosed
** statement.
**
** Notes On Invalid UTF-8:
**
**  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This routine never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xe000 then it is rendered as 0xfffd.
**
**  *  A multi-byte character that decodes to 0xfffe or 0xffff is also
**     rendered as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This routine accepts an infinite number of different UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
*/
#define READ_UTF8(zIn, zTerm, c) {                         \
  (c) = *((zIn)++);                                        \
  if( (c)>=0xc0 ){                                         \
    (c) = sqlite3Utf8Trans1[(c)-0xc0];                     \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){      \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                \
    }                                                      \
    if( (c)<0x80                                           \
        || ((c)&0xFFFFF800)==0xD800                        \
        || ((c)&0xFFFFFFFE)==0xFFFE ){  (c) = 0xFFFD; }    \
  }                                                        \
}
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int sqlite3Utf8Read(
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const unsigned char *zIn,       /* First byte of UTF-8 character */
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)){
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  unsigned int c;
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  /* Same as READ_UTF8() above but without the zTerm parameter.
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ** For this routine, we assume the UTF8 string is always zero-terminated.
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  */
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c = *(zIn++);
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( c>=0xc0 ){
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    c = sqlite3Utf8Trans1[c-0xc0];
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while( (*zIn & 0xc0)==0x80 ){
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      c = (c<<6) + (0x3f & *(zIn++));
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( c<0x80
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        || (c&0xFFFFF800)==0xD800
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *pzNext = zIn;
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return c;
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
*/
/* #define TRANSLATE_TRACE 1 */
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef SQLITE_OMIT_UTF16
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** This routine transforms the internal text encoding used by pMem to
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** desiredEnc. It is an error if the string is already of the desired
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** encoding, or if *pMem does not contain a string value.
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int len;                    /* Maximum length of output string in bytes */
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  unsigned char *zOut;                  /* Output buffer */
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  unsigned char *zIn;                   /* Input iterator */
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  unsigned char *zTerm;                 /* End of input */
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  unsigned char *z;                     /* Output iterator */
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  unsigned int c;
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( pMem->flags&MEM_Str );
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( pMem->enc!=desiredEnc );
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( pMem->enc!=0 );
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( pMem->n>=0 );
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    char zBuf[100];
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fprintf(stderr, "INPUT:  %s\n", zBuf);
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  /* If the translation is between UTF-16 little and big endian, then
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ** all that is required is to swap the byte order. This case is handled
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ** differently from the others.
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  */
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    u8 temp;
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int rc;
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rc = sqlite3VdbeMemMakeWriteable(pMem);
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( rc!=SQLITE_OK ){
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      assert( rc==SQLITE_NOMEM );
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return SQLITE_NOMEM;
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    zIn = (u8*)pMem->z;
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    zTerm = &zIn[pMem->n&~1];
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while( zIn<zTerm ){
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      temp = *zIn;
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *zIn = *(zIn+1);
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      zIn++;
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *zIn++ = temp;
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    pMem->enc = desiredEnc;
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    goto translate_out;
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  /* Set len to the maximum number of bytes required in the output buffer. */
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( desiredEnc==SQLITE_UTF8 ){
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* When converting from UTF-16, the maximum growth results from
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ** translating a 2-byte character to a 4-byte UTF-8 character.
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ** A single byte is required for the output string
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ** nul-terminator.
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    */
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    pMem->n &= ~1;
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    len = pMem->n * 2 + 1;
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }else{
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    /* When converting from UTF-8 to UTF-16 the maximum growth is caused
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ** character. Two bytes are required in the output buffer for the
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ** nul-terminator.
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    */
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    len = pMem->n * 2 + 2;
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  /* Set zIn to point at the start of the input buffer and zTerm to point 1
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ** byte past the end.
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  **
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ** Variable zOut is set to point at the output buffer, space obtained
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ** from sqlite3_malloc().
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  */
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  zIn = (u8*)pMem->z;
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  zTerm = &zIn[pMem->n];
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  zOut = sqlite3DbMallocRaw(pMem->db, len);
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( !zOut ){
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return SQLITE_NOMEM;
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  z = zOut;
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( pMem->enc==SQLITE_UTF8 ){
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( desiredEnc==SQLITE_UTF16LE ){
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      /* UTF-8 -> UTF-16 Little-endian */
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      while( zIn<zTerm ){
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        READ_UTF8(zIn, zTerm, c);
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        WRITE_UTF16LE(z, c);
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }else{
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      assert( desiredEnc==SQLITE_UTF16BE );
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      /* UTF-8 -> UTF-16 Big-endian */
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      while( zIn<zTerm ){
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        READ_UTF8(zIn, zTerm, c);
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        WRITE_UTF16BE(z, c);
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    pMem->n = (int)(z - zOut);
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    *z++ = 0;
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }else{
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    assert( desiredEnc==SQLITE_UTF8 );
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( pMem->enc==SQLITE_UTF16LE ){
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      /* UTF-16 Little-endian -> UTF-8 */
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      while( zIn<zTerm ){
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        READ_UTF16LE(zIn, zIn<zTerm, c);
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        WRITE_UTF8(z, c);
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }else{
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      /* UTF-16 Big-endian -> UTF-8 */
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      while( zIn<zTerm ){
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        READ_UTF16BE(zIn, zIn<zTerm, c);
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        WRITE_UTF8(z, c);
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    pMem->n = (int)(z - zOut);
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *z = 0;
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3VdbeMemRelease(pMem);
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem);
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pMem->enc = desiredEnc;
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pMem->flags |= (MEM_Term|MEM_Dyn);
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pMem->z = (char*)zOut;
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pMem->zMalloc = pMem->z;
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)translate_out:
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    char zBuf[100];
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sqlite3VdbeMemPrettyPrint(pMem, zBuf);
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fprintf(stderr, "OUTPUT: %s\n", zBuf);
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return SQLITE_OK;
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** This routine checks for a byte-order mark at the beginning of the
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** UTF-16 string stored in *pMem. If one is present, it is removed and
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** the encoding of the Mem adjusted. This routine does not do any
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** byte-swapping, it just sets Mem.enc appropriately.
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)**
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** The allocation (static, dynamic etc.) and encoding of the Mem may be
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** changed by this function.
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int sqlite3VdbeMemHandleBom(Mem *pMem){
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int rc = SQLITE_OK;
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  u8 bom = 0;
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( pMem->n>=0 );
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( pMem->n>1 ){
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    u8 b1 = *(u8 *)pMem->z;
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    u8 b2 = *(((u8 *)pMem->z) + 1);
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( b1==0xFE && b2==0xFF ){
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      bom = SQLITE_UTF16BE;
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( b1==0xFF && b2==0xFE ){
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      bom = SQLITE_UTF16LE;
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( bom ){
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rc = sqlite3VdbeMemMakeWriteable(pMem);
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( rc==SQLITE_OK ){
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      pMem->n -= 2;
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      memmove(pMem->z, &pMem->z[2], pMem->n);
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      pMem->z[pMem->n] = '\0';
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      pMem->z[pMem->n+1] = '\0';
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      pMem->flags |= MEM_Term;
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      pMem->enc = bom;
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return rc;
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif /* SQLITE_OMIT_UTF16 */
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** return the number of unicode characters in pZ up to (but not including)
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** the first 0x00 byte. If nByte is not less than zero, return the
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** number of unicode characters in the first nByte of pZ (or up to
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** the first 0x00, whichever comes first).
3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int sqlite3Utf8CharLen(const char *zIn, int nByte){
3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int r = 0;
3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const u8 *z = (const u8*)zIn;
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const u8 *zTerm;
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( nByte>=0 ){
3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    zTerm = &z[nByte];
3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }else{
3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    zTerm = (const u8*)(-1);
3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( z<=zTerm );
3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while( *z!=0 && z<zTerm ){
3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    SQLITE_SKIP_UTF8(z);
3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    r++;
3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return r;
4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/* This test function is not currently used by the automated test-suite.
** Hence it is only available in debug builds.
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Translate UTF-8 to UTF-8, in place.
**
** Each character is decoded with sqlite3Utf8Read() and written back with
** WRITE_UTF8(), except that characters which decode to 0xfffd are
** dropped.  The effect is to make the string well-formed UTF-8 with
** miscoded characters removed.
**
** The loop stops at the 0x00 terminator of the input, or as soon as the
** write position has moved past the read position.  The result is
** zero-terminated and its length in bytes is returned.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char * const zBase = zIn;   /* Start of the buffer */
  unsigned char *zWrite = zIn;         /* Where the next character goes */
  u32 ch;                              /* Most recently decoded character */

  for(;;){
    if( zIn[0]==0 ) break;             /* End of input */
    if( zWrite>zIn ) break;            /* Output has overrun the input */
    ch = sqlite3Utf8Read(zIn, (const u8**)&zIn);
    if( ch==0xfffd ) continue;         /* Drop replacement characters */
    WRITE_UTF8(zWrite, ch);
  }
  *zWrite = 0;
  return (int)(zWrite - zBase);
}
#endif
4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef SQLITE_OMIT_UTF16
4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Convert a UTF-16 string in the native encoding into a UTF-8 string.
4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** be freed by the calling function.
4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)**
4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** NULL is returned if there is an allocation error.
4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Mem m;
4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  memset(&m, 0, sizeof(m));
4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  m.db = db;
4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( db->mallocFailed ){
4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sqlite3VdbeMemRelease(&m);
4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    m.z = 0;
4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( (m.flags & MEM_Dyn)!=0 || db->mallocFailed );
4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  assert( m.z || db->mallocFailed );
4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return m.z;
4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
** Convert the n-byte UTF-8 string z to the UTF-16 encoding specified by
** parameter enc.  A pointer to the new string is returned, and *pnOut is
** set to the length of the returned string in bytes.  The caller should
** arrange to call sqlite3DbFree() on the returned pointer when it is
** no longer required.
**
** If a malloc failure occurs, NULL is returned and the db.mallocFailed
** flag is set.  *pnOut is left untouched in that case.
*/
#ifdef SQLITE_ENABLE_STAT2
char *sqlite3Utf8to16(sqlite3 *db, u8 enc, char *z, int n, int *pnOut){
  Mem m;     /* Scratch value used to drive the conversion */
  int rc;    /* Result of the translation step */

  memset(&m, 0, sizeof(m));
  m.db = db;
  sqlite3VdbeMemSetStr(&m, z, n, SQLITE_UTF8, SQLITE_STATIC);

  rc = sqlite3VdbeMemTranslate(&m, enc);
  if( rc!=0 ){
    assert( db->mallocFailed );
    return 0;
  }

  /* The translated text lives in the Mem's own allocation, which is
  ** handed over to the caller. */
  assert( m.z==m.zMalloc );
  *pnOut = m.n;
  return m.z;
}
#endif
4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** zIn is a UTF-16 encoded unicode string at least nChar characters long.
4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Return the number of bytes in the first nChar unicode characters
4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** in pZ.  nChar must be non-negative.
4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int sqlite3Utf16ByteLen(const void *zIn, int nChar){
4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int c;
4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  unsigned char const *z = zIn;
4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int n = 0;
4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while( n<nChar ){
4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      READ_UTF16BE(z, 1, c);
4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      n++;
4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }else{
4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while( n<nChar ){
5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      READ_UTF16LE(z, 1, c);
5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      n++;
5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return (int)(z-(unsigned char const *)zIn);
5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** For every code point below 0x110000 it writes the character with one
** of the WRITE_* primitives, reads it back with the matching reader, and
** asserts that the value and the byte count both round-trip.  This is
** done for UTF-8, then UTF-16LE, then UTF-16BE.
*/
void sqlite3UtfSelfTest(void){
  unsigned int cp;          /* Code point under test */
  unsigned int expect;      /* Value the UTF-8 reader should return */
  unsigned int got;         /* Value actually read back */
  unsigned char aBuf[20];   /* Scratch buffer for one encoded character */
  unsigned char *p;         /* Cursor into aBuf[] */
  int nByte;                /* Size of the encoded character in bytes */

  /* UTF-8: surrogates and 0xFFFE/0xFFFF are expected to read back
  ** as 0xFFFD. */
  for(cp=0; cp<0x00110000; cp++){
    p = aBuf;
    WRITE_UTF8(p, cp);
    nByte = (int)(p-aBuf);
    assert( nByte>0 && nByte<=4 );
    p[0] = 0;
    p = aBuf;
    got = sqlite3Utf8Read(p, (const u8**)&p);
    if( (cp>=0xD800 && cp<=0xDFFF) || (cp&0xFFFFFFFE)==0xFFFE ){
      expect = 0xFFFD;
    }else{
      expect = cp;
    }
    assert( got==expect );
    assert( (p-aBuf)==nByte );
  }

  /* UTF-16 little-endian: the surrogate range is skipped. */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>=0xE000 ){
      p = aBuf;
      WRITE_UTF16LE(p, cp);
      nByte = (int)(p-aBuf);
      assert( nByte>0 && nByte<=4 );
      p[0] = 0;
      p = aBuf;
      READ_UTF16LE(p, 1, got);
      assert( got==cp );
      assert( (p-aBuf)==nByte );
    }
  }

  /* UTF-16 big-endian: the surrogate range is skipped. */
  for(cp=0; cp<0x00110000; cp++){
    if( cp<0xD800 || cp>=0xE000 ){
      p = aBuf;
      WRITE_UTF16BE(p, cp);
      nByte = (int)(p-aBuf);
      assert( nByte>0 && nByte<=4 );
      p[0] = 0;
      p = aBuf;
      READ_UTF16BE(p, 1, got);
      assert( got==cp );
      assert( (p-aBuf)==nByte );
    }
  }
}
#endif /* SQLITE_TEST */
5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif /* SQLITE_OMIT_UTF16 */
561