1/*
2** 2007 May 6
3**
4** The author disclaims copyright to this source code.  In place of
5** a legal notice, here is a blessing:
6**
7**    May you do good and not evil.
8**    May you find forgiveness for yourself and forgive others.
9**    May you share freely, never taking more than you give.
10**
11*************************************************************************
12** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
13**
14** This file implements an integration between the ICU library
15** ("International Components for Unicode", an open-source library
16** for handling unicode data) and SQLite. The integration uses
17** ICU to provide the following to SQLite:
18**
19**   * An implementation of the SQL regexp() function (and hence REGEXP
20**     operator) using the ICU uregex_XX() APIs.
21**
22**   * Implementations of the SQL scalar upper() and lower() functions
23**     for case mapping.
24**
**   * Integration of ICU and SQLite collation sequences.
26**
27**   * An implementation of the LIKE operator that uses ICU to
28**     provide case-independent matching.
29*/
30
31#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
32
33/* Include ICU headers */
34#include <unicode/utypes.h>
35#include <unicode/uregex.h>
36#include <unicode/ustring.h>
37#include <unicode/ucol.h>
38
39#include <assert.h>
40
41#ifndef SQLITE_CORE
42  #include "sqlite3ext.h"
43  SQLITE_EXTENSION_INIT1
44#else
45  #include "sqlite3.h"
46#endif
47
48/*
49** Maximum length (in bytes) of the pattern in a LIKE or GLOB
50** operator.
51*/
52#ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
53# define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
54#endif
55
/*
** Function-pointer friendly wrapper around sqlite3_free(). It exists so
** that a real function address is available to hand to SQLite as a
** destructor callback even when sqlite3_free() is implemented as a macro.
*/
static void xFree(void *p)
{
  sqlite3_free(p);
}
62
63/*
64** Compare two UTF-8 strings for equality where the first string is
65** a "LIKE" expression. Return true (1) if they are the same and
66** false (0) if they are different.
67*/
68static int icuLikeCompare(
69  const uint8_t *zPattern,   /* LIKE pattern */
70  const uint8_t *zString,    /* The UTF-8 string to compare against */
71  const UChar32 uEsc         /* The escape character */
72){
73  static const int MATCH_ONE = (UChar32)'_';
74  static const int MATCH_ALL = (UChar32)'%';
75
76  int iPattern = 0;       /* Current byte index in zPattern */
77  int iString = 0;        /* Current byte index in zString */
78
79  int prevEscape = 0;     /* True if the previous character was uEsc */
80
81  while( zPattern[iPattern]!=0 ){
82
83    /* Read (and consume) the next character from the input pattern. */
84    UChar32 uPattern;
85    U8_NEXT_UNSAFE(zPattern, iPattern, uPattern);
86    assert(uPattern!=0);
87
88    /* There are now 4 possibilities:
89    **
90    **     1. uPattern is an unescaped match-all character "%",
91    **     2. uPattern is an unescaped match-one character "_",
92    **     3. uPattern is an unescaped escape character, or
93    **     4. uPattern is to be handled as an ordinary character
94    */
95    if( !prevEscape && uPattern==MATCH_ALL ){
96      /* Case 1. */
97      uint8_t c;
98
99      /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
100      ** MATCH_ALL. For each MATCH_ONE, skip one character in the
101      ** test string.
102      */
103      while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){
104        if( c==MATCH_ONE ){
105          if( zString[iString]==0 ) return 0;
106          U8_FWD_1_UNSAFE(zString, iString);
107        }
108        iPattern++;
109      }
110
111      if( zPattern[iPattern]==0 ) return 1;
112
113      while( zString[iString] ){
114        if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){
115          return 1;
116        }
117        U8_FWD_1_UNSAFE(zString, iString);
118      }
119      return 0;
120
121    }else if( !prevEscape && uPattern==MATCH_ONE ){
122      /* Case 2. */
123      if( zString[iString]==0 ) return 0;
124      U8_FWD_1_UNSAFE(zString, iString);
125
126    }else if( !prevEscape && uPattern==uEsc){
127      /* Case 3. */
128      prevEscape = 1;
129
130    }else{
131      /* Case 4. */
132      UChar32 uString;
133      U8_NEXT_UNSAFE(zString, iString, uString);
134      uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
135      uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
136      if( uString!=uPattern ){
137        return 0;
138      }
139      prevEscape = 0;
140    }
141  }
142
143  return zString[iString]==0;
144}
145
146/*
147** Implementation of the like() SQL function.  This function implements
148** the build-in LIKE operator.  The first argument to the function is the
149** pattern and the second argument is the string.  So, the SQL statements:
150**
151**       A LIKE B
152**
153** is implemented as like(B, A). If there is an escape character E,
154**
155**       A LIKE B ESCAPE E
156**
157** is mapped to like(B, A, E).
158*/
159static void icuLikeFunc(
160  sqlite3_context *context,
161  int argc,
162  sqlite3_value **argv
163){
164  const unsigned char *zA = sqlite3_value_text(argv[0]);
165  const unsigned char *zB = sqlite3_value_text(argv[1]);
166  UChar32 uEsc = 0;
167
168  /* Limit the length of the LIKE or GLOB pattern to avoid problems
169  ** of deep recursion and N*N behavior in patternCompare().
170  */
171  if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
172    sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
173    return;
174  }
175
176
177  if( argc==3 ){
178    /* The escape character string must consist of a single UTF-8 character.
179    ** Otherwise, return an error.
180    */
181    int nE= sqlite3_value_bytes(argv[2]);
182    const unsigned char *zE = sqlite3_value_text(argv[2]);
183    int i = 0;
184    if( zE==0 ) return;
185    U8_NEXT(zE, i, nE, uEsc);
186    if( i!=nE){
187      sqlite3_result_error(context,
188          "ESCAPE expression must be a single character", -1);
189      return;
190    }
191  }
192
193  if( zA && zB ){
194    sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
195  }
196}
197
198/*
199** This function is called when an ICU function called from within
200** the implementation of an SQL scalar function returns an error.
201**
202** The scalar function context passed as the first argument is
203** loaded with an error message based on the following two args.
204*/
205static void icuFunctionError(
206  sqlite3_context *pCtx,       /* SQLite scalar function context */
207  const char *zName,           /* Name of ICU function that failed */
208  UErrorCode e                 /* Error code returned by ICU function */
209){
210  char zBuf[128];
211  sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
212  zBuf[127] = '\0';
213  sqlite3_result_error(pCtx, zBuf, -1);
214}
215
216/*
217** Function to delete compiled regexp objects. Registered as
218** a destructor function with sqlite3_set_auxdata().
219*/
220static void icuRegexpDelete(void *p){
221  URegularExpression *pExpr = (URegularExpression *)p;
222  uregex_close(pExpr);
223}
224
225/*
226** Implementation of SQLite REGEXP operator. This scalar function takes
227** two arguments. The first is a regular expression pattern to compile
228** the second is a string to match against that pattern. If either
229** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
230** is 1 if the string matches the pattern, or 0 otherwise.
231**
232** SQLite maps the regexp() function to the regexp() operator such
233** that the following two are equivalent:
234**
235**     zString REGEXP zPattern
236**     regexp(zPattern, zString)
237**
238** Uses the following ICU regexp APIs:
239**
240**     uregex_open()
241**     uregex_matches()
242**     uregex_close()
243*/
244static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
245  UErrorCode status = U_ZERO_ERROR;
246  URegularExpression *pExpr;
247  UBool res;
248  const UChar *zString = sqlite3_value_text16(apArg[1]);
249
250  (void)nArg;  /* Unused parameter */
251
252  /* If the left hand side of the regexp operator is NULL,
253  ** then the result is also NULL.
254  */
255  if( !zString ){
256    return;
257  }
258
259  pExpr = sqlite3_get_auxdata(p, 0);
260  if( !pExpr ){
261    const UChar *zPattern = sqlite3_value_text16(apArg[0]);
262    if( !zPattern ){
263      return;
264    }
265    pExpr = uregex_open(zPattern, -1, 0, 0, &status);
266
267    if( U_SUCCESS(status) ){
268      sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
269    }else{
270      assert(!pExpr);
271      icuFunctionError(p, "uregex_open", status);
272      return;
273    }
274  }
275
276  /* Configure the text that the regular expression operates on. */
277  uregex_setText(pExpr, zString, -1, &status);
278  if( !U_SUCCESS(status) ){
279    icuFunctionError(p, "uregex_setText", status);
280    return;
281  }
282
283  /* Attempt the match */
284  res = uregex_matches(pExpr, 0, &status);
285  if( !U_SUCCESS(status) ){
286    icuFunctionError(p, "uregex_matches", status);
287    return;
288  }
289
290  /* Set the text that the regular expression operates on to a NULL
291  ** pointer. This is not really necessary, but it is tidier than
292  ** leaving the regular expression object configured with an invalid
293  ** pointer after this function returns.
294  */
295  uregex_setText(pExpr, 0, 0, &status);
296
297  /* Return 1 or 0. */
298  sqlite3_result_int(p, res ? 1 : 0);
299}
300
301/*
302** Implementations of scalar functions for case mapping - upper() and
303** lower(). Function upper() converts its input to upper-case (ABC).
304** Function lower() converts to lower-case (abc).
305**
306** ICU provides two types of case mapping, "general" case mapping and
307** "language specific". Refer to ICU documentation for the differences
308** between the two.
309**
310** To utilise "general" case mapping, the upper() or lower() scalar
311** functions are invoked with one argument:
312**
313**     upper('ABC') -> 'abc'
314**     lower('abc') -> 'ABC'
315**
316** To access ICU "language specific" case mapping, upper() or lower()
317** should be invoked with two arguments. The second argument is the name
318** of the locale to use. Passing an empty string ("") or SQL NULL value
319** as the second argument is the same as invoking the 1 argument version
320** of upper() or lower().
321**
322**     lower('I', 'en_us') -> 'i'
323**     lower('I', 'tr_tr') -> 'ı' (small dotless i)
324**
325** http://www.icu-project.org/userguide/posix.html#case_mappings
326*/
327static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
328  const UChar *zInput;
329  UChar *zOutput;
330  int nInput;
331  int nOutput;
332
333  UErrorCode status = U_ZERO_ERROR;
334  const char *zLocale = 0;
335
336  assert(nArg==1 || nArg==2);
337  if( nArg==2 ){
338    zLocale = (const char *)sqlite3_value_text(apArg[1]);
339  }
340
341  zInput = sqlite3_value_text16(apArg[0]);
342  if( !zInput ){
343    return;
344  }
345  nInput = sqlite3_value_bytes16(apArg[0]);
346
347  nOutput = nInput * 2 + 2;
348  zOutput = sqlite3_malloc(nOutput);
349  if( !zOutput ){
350    return;
351  }
352
353  if( sqlite3_user_data(p) ){
354    u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
355  }else{
356    u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
357  }
358
359  if( !U_SUCCESS(status) ){
360    icuFunctionError(p, "u_strToLower()/u_strToUpper", status);
361    return;
362  }
363
364  sqlite3_result_text16(p, zOutput, -1, xFree);
365}
366
367/*
368** Collation sequence destructor function. The pCtx argument points to
369** a UCollator structure previously allocated using ucol_open().
370*/
371static void icuCollationDel(void *pCtx){
372  UCollator *p = (UCollator *)pCtx;
373  ucol_close(p);
374}
375
376/*
377** Collation sequence comparison function. The pCtx argument points to
378** a UCollator structure previously allocated using ucol_open().
379*/
380static int icuCollationColl(
381  void *pCtx,
382  int nLeft,
383  const void *zLeft,
384  int nRight,
385  const void *zRight
386){
387  UCollationResult res;
388  UCollator *p = (UCollator *)pCtx;
389  res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
390  switch( res ){
391    case UCOL_LESS:    return -1;
392    case UCOL_GREATER: return +1;
393    case UCOL_EQUAL:   return 0;
394  }
395  assert(!"Unexpected return value from ucol_strcoll()");
396  return 0;
397}
398
399/*
400** Implementation of the scalar function icu_load_collation().
401**
402** This scalar function is used to add ICU collation based collation
403** types to an SQLite database connection. It is intended to be called
404** as follows:
405**
406**     SELECT icu_load_collation(<locale>, <collation-name>);
407**
408** Where <locale> is a string containing an ICU locale identifier (i.e.
409** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
410** collation sequence to create.
411*/
412static void icuLoadCollation(
413  sqlite3_context *p,
414  int nArg,
415  sqlite3_value **apArg
416){
417  sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
418  UErrorCode status = U_ZERO_ERROR;
419  const char *zLocale;      /* Locale identifier - (eg. "jp_JP") */
420  const char *zName;        /* SQL Collation sequence name (eg. "japanese") */
421  UCollator *pUCollator;    /* ICU library collation object */
422  int rc;                   /* Return code from sqlite3_create_collation_x() */
423
424  assert(nArg==2);
425  zLocale = (const char *)sqlite3_value_text(apArg[0]);
426  zName = (const char *)sqlite3_value_text(apArg[1]);
427
428  if( !zLocale || !zName ){
429    return;
430  }
431
432  pUCollator = ucol_open(zLocale, &status);
433  if( !U_SUCCESS(status) ){
434    icuFunctionError(p, "ucol_open", status);
435    return;
436  }
437  assert(p);
438
439  rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator,
440      icuCollationColl, icuCollationDel
441  );
442  if( rc!=SQLITE_OK ){
443    ucol_close(pUCollator);
444    sqlite3_result_error(p, "Error registering collation function", -1);
445  }
446}
447
448/*
449** Register the ICU extension functions with database db.
450*/
451int sqlite3IcuInit(sqlite3 *db){
452  struct IcuScalar {
453    const char *zName;                        /* Function name */
454    int nArg;                                 /* Number of arguments */
455    int enc;                                  /* Optimal text encoding */
456    void *pContext;                           /* sqlite3_user_data() context */
457    void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
458  } scalars[] = {
459    {"regexp", 2, SQLITE_ANY,          0, icuRegexpFunc},
460
461    {"lower",  1, SQLITE_UTF16,        0, icuCaseFunc16},
462    {"lower",  2, SQLITE_UTF16,        0, icuCaseFunc16},
463    {"upper",  1, SQLITE_UTF16, (void*)1, icuCaseFunc16},
464    {"upper",  2, SQLITE_UTF16, (void*)1, icuCaseFunc16},
465
466    {"lower",  1, SQLITE_UTF8,         0, icuCaseFunc16},
467    {"lower",  2, SQLITE_UTF8,         0, icuCaseFunc16},
468    {"upper",  1, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
469    {"upper",  2, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
470
471    {"like",   2, SQLITE_UTF8,         0, icuLikeFunc},
472    {"like",   3, SQLITE_UTF8,         0, icuLikeFunc},
473
474    {"icu_load_collation",  2, SQLITE_UTF8, (void*)db, icuLoadCollation},
475  };
476
477  int rc = SQLITE_OK;
478  int i;
479
480  for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
481    struct IcuScalar *p = &scalars[i];
482    rc = sqlite3_create_function(
483        db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0
484    );
485  }
486
487  return rc;
488}
489
490#if !SQLITE_CORE
491int sqlite3_extension_init(
492  sqlite3 *db,
493  char **pzErrMsg,
494  const sqlite3_api_routines *pApi
495){
496  SQLITE_EXTENSION_INIT2(pApi)
497  return sqlite3IcuInit(db);
498}
499#endif
500
501#endif
502