1/*
2** 2007 June 22
3**
4** The author disclaims copyright to this source code.  In place of
5** a legal notice, here is a blessing:
6**
7**    May you do good and not evil.
8**    May you find forgiveness for yourself and forgive others.
9**    May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file implements a tokenizer for fts3 based on the ICU library.
13**
14** $Id: fts3_icu.c,v 1.3 2008/09/01 18:34:20 danielk1977 Exp $
15*/
16
17#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
18#ifdef SQLITE_ENABLE_ICU
19
#include <assert.h>
#include <limits.h>
#include <string.h>
#include "fts3_tokenizer.h"

#include <unicode/ubrk.h>
#include <unicode/ucol.h>
#include <unicode/ustring.h>
#include <unicode/utf16.h>
28
29typedef struct IcuTokenizer IcuTokenizer;
30typedef struct IcuCursor IcuCursor;
31
32struct IcuTokenizer {
33  sqlite3_tokenizer base;
34  char *zLocale;
35};
36
37struct IcuCursor {
38  sqlite3_tokenizer_cursor base;
39
40  UBreakIterator *pIter;      /* ICU break-iterator object */
41  int nChar;                  /* Number of UChar elements in pInput */
42  UChar *aChar;               /* Copy of input using utf-16 encoding */
43  int *aOffset;               /* Offsets of each character in utf-8 input */
44
45  int nBuffer;
46  char *zBuffer;
47
48  int iToken;
49};
50
51/*
52** Create a new tokenizer instance.
53*/
54static int icuCreate(
55  int argc,                            /* Number of entries in argv[] */
56  const char * const *argv,            /* Tokenizer creation arguments */
57  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
58){
59  IcuTokenizer *p;
60  int n = 0;
61
62  if( argc>0 ){
63    n = strlen(argv[0])+1;
64  }
65  p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
66  if( !p ){
67    return SQLITE_NOMEM;
68  }
69  memset(p, 0, sizeof(IcuTokenizer));
70
71  if( n ){
72    p->zLocale = (char *)&p[1];
73    memcpy(p->zLocale, argv[0], n);
74  }
75
76  *ppTokenizer = (sqlite3_tokenizer *)p;
77
78  return SQLITE_OK;
79}
80
81/*
82** Destroy a tokenizer
83*/
84static int icuDestroy(sqlite3_tokenizer *pTokenizer){
85  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
86  sqlite3_free(p);
87  return SQLITE_OK;
88}
89
90/*
91** Prepare to begin tokenizing a particular string.  The input
92** string to be tokenized is pInput[0..nBytes-1].  A cursor
93** used to incrementally tokenize this string is returned in
94** *ppCursor.
95*/
96static int icuOpen(
97  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
98  const char *zInput,                    /* Input string */
99  int nInput,                            /* Length of zInput in bytes */
100  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
101){
102  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
103  IcuCursor *pCsr;
104
105  const int32_t opt = U_FOLD_CASE_DEFAULT;
106  UErrorCode status = U_ZERO_ERROR;
107  int nChar;
108
109  UChar32 c;
110  int iInput = 0;
111  int iOut = 0;
112
113  *ppCursor = 0;
114
115  if( nInput<0 ){
116    nInput = strlen(zInput);
117  }
118  nChar = nInput+1;
119  pCsr = (IcuCursor *)sqlite3_malloc(
120      sizeof(IcuCursor) +                /* IcuCursor */
121      (nChar+1) * sizeof(int) +          /* IcuCursor.aOffset[] */
122      nChar * sizeof(UChar)              /* IcuCursor.aChar[] */
123  );
124  if( !pCsr ){
125    return SQLITE_NOMEM;
126  }
127  memset(pCsr, 0, sizeof(IcuCursor));
128  pCsr->aOffset = (int *)&pCsr[1];
129  pCsr->aChar = (UChar *)&pCsr->aOffset[nChar+1];
130
131  pCsr->aOffset[iOut] = iInput;
132  U8_NEXT(zInput, iInput, nInput, c);
133  while( c>0 ){
134    int isError = 0;
135    c = u_foldCase(c, opt);
136    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
137    if( isError ){
138      sqlite3_free(pCsr);
139      return SQLITE_ERROR;
140    }
141    pCsr->aOffset[iOut] = iInput;
142
143    if( iInput<nInput ){
144      U8_NEXT(zInput, iInput, nInput, c);
145    }else{
146      c = 0;
147    }
148  }
149
150  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
151  if( !U_SUCCESS(status) ){
152    sqlite3_free(pCsr);
153    return SQLITE_ERROR;
154  }
155  pCsr->nChar = iOut;
156
157  ubrk_first(pCsr->pIter);
158  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
159  return SQLITE_OK;
160}
161
162/*
163** Close a tokenization cursor previously opened by a call to icuOpen().
164*/
165static int icuClose(sqlite3_tokenizer_cursor *pCursor){
166  IcuCursor *pCsr = (IcuCursor *)pCursor;
167  ubrk_close(pCsr->pIter);
168  sqlite3_free(pCsr->zBuffer);
169  sqlite3_free(pCsr);
170  return SQLITE_OK;
171}
172
173/*
174** Extract the next token from a tokenization cursor.
175*/
176static int icuNext(
177  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
178  const char **ppToken,               /* OUT: *ppToken is the token text */
179  int *pnBytes,                       /* OUT: Number of bytes in token */
180  int *piStartOffset,                 /* OUT: Starting offset of token */
181  int *piEndOffset,                   /* OUT: Ending offset of token */
182  int *piPosition                     /* OUT: Position integer of token */
183){
184  IcuCursor *pCsr = (IcuCursor *)pCursor;
185
186  int iStart = 0;
187  int iEnd = 0;
188  int nByte = 0;
189
190  while( iStart==iEnd ){
191    UChar32 c;
192
193    iStart = ubrk_current(pCsr->pIter);
194    iEnd = ubrk_next(pCsr->pIter);
195    if( iEnd==UBRK_DONE ){
196      return SQLITE_DONE;
197    }
198
199    while( iStart<iEnd ){
200      int iWhite = iStart;
201      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
202      if( u_isspace(c) ){
203        iStart = iWhite;
204      }else{
205        break;
206      }
207    }
208    assert(iStart<=iEnd);
209  }
210
211  do {
212    UErrorCode status = U_ZERO_ERROR;
213    if( nByte ){
214      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
215      if( !zNew ){
216        return SQLITE_NOMEM;
217      }
218      pCsr->zBuffer = zNew;
219      pCsr->nBuffer = nByte;
220    }
221
222    u_strToUTF8(
223        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
224        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
225        &status                                  /* Output success/failure */
226    );
227  } while( nByte>pCsr->nBuffer );
228
229  *ppToken = pCsr->zBuffer;
230  *pnBytes = nByte;
231  *piStartOffset = pCsr->aOffset[iStart];
232  *piEndOffset = pCsr->aOffset[iEnd];
233  *piPosition = pCsr->iToken++;
234
235  return SQLITE_OK;
236}
237
238/*
239** The set of routines that implement the simple tokenizer
240*/
241static const sqlite3_tokenizer_module icuTokenizerModule = {
242  0,                           /* iVersion */
243  icuCreate,                   /* xCreate  */
244  icuDestroy,                  /* xCreate  */
245  icuOpen,                     /* xOpen    */
246  icuClose,                    /* xClose   */
247  icuNext,                     /* xNext    */
248};
249
250/*
251** Set *ppModule to point at the implementation of the ICU tokenizer.
252*/
253void sqlite3Fts3IcuTokenizerModule(
254  sqlite3_tokenizer_module const**ppModule
255){
256  *ppModule = &icuTokenizerModule;
257}
258
259#endif /* defined(SQLITE_ENABLE_ICU) */
260#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
261