/*
** 2007 June 22
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements a tokenizer for fts2 based on the ICU library.
**
** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
*/
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifdef SQLITE_ENABLE_ICU
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <assert.h>
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string.h>
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "fts2_tokenizer.h"
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <unicode/ubrk.h>
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <unicode/ucol.h>
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <unicode/ustring.h>
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <unicode/utf16.h>
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)typedef struct IcuTokenizer IcuTokenizer;
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)typedef struct IcuCursor IcuCursor;
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct IcuTokenizer {
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_tokenizer base;
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char *zLocale;
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct IcuCursor {
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_tokenizer_cursor base;
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UBreakIterator *pIter;      /* ICU break-iterator object */
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int nChar;                  /* Number of UChar elements in pInput */
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UChar *aChar;               /* Copy of input using utf-16 encoding */
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int *aOffset;               /* Offsets of each character in utf-8 input */
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int nBuffer;
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char *zBuffer;
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int iToken;
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Create a new tokenizer instance.
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int icuCreate(
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int argc,                            /* Number of entries in argv[] */
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char * const *argv,            /* Tokenizer creation arguments */
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)){
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IcuTokenizer *p;
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int n = 0;
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( argc>0 ){
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    n = strlen(argv[0])+1;
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( !p ){
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return SQLITE_NOMEM;
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  memset(p, 0, sizeof(IcuTokenizer));
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( n ){
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    p->zLocale = (char *)&p[1];
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    memcpy(p->zLocale, argv[0], n);
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *ppTokenizer = (sqlite3_tokenizer *)p;
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return SQLITE_OK;
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Destroy a tokenizer
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int icuDestroy(sqlite3_tokenizer *pTokenizer){
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_free(p);
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return SQLITE_OK;
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Prepare to begin tokenizing a particular string.  The input
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** string to be tokenized is pInput[0..nBytes-1].  A cursor
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** used to incrementally tokenize this string is returned in
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** *ppCursor.
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int icuOpen(
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char *zInput,                    /* Input string */
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int nInput,                            /* Length of zInput in bytes */
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)){
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IcuCursor *pCsr;
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int32_t opt = U_FOLD_CASE_DEFAULT;
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int nChar;
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UChar32 c;
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int iInput = 0;
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int iOut = 0;
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *ppCursor = 0;
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( nInput<0 ){
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    nInput = strlen(zInput);
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  nChar = nInput+1;
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pCsr = (IcuCursor *)sqlite3_malloc(
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      sizeof(IcuCursor) +                /* IcuCursor */
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      (nChar+1) * sizeof(int) +          /* IcuCursor.aOffset[] */
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nChar * sizeof(UChar)              /* IcuCursor.aChar[] */
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  );
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( !pCsr ){
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return SQLITE_NOMEM;
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  memset(pCsr, 0, sizeof(IcuCursor));
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pCsr->aOffset = (int *)&pCsr[1];
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pCsr->aChar = (UChar *)&pCsr->aOffset[nChar+1];
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pCsr->aOffset[iOut] = iInput;
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  U8_NEXT(zInput, iInput, nInput, c);
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while( c>0 ){
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int isError = 0;
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    c = u_foldCase(c, opt);
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( isError ){
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      sqlite3_free(pCsr);
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return SQLITE_ERROR;
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    pCsr->aOffset[iOut] = iInput;
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( iInput<nInput ){
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      U8_NEXT(zInput, iInput, nInput, c);
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }else{
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      c = 0;
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if( !U_SUCCESS(status) ){
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sqlite3_free(pCsr);
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return SQLITE_ERROR;
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pCsr->nChar = iOut;
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ubrk_first(pCsr->pIter);
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return SQLITE_OK;
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Close a tokenization cursor previously opened by a call to icuOpen().
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int icuClose(sqlite3_tokenizer_cursor *pCursor){
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IcuCursor *pCsr = (IcuCursor *)pCursor;
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ubrk_close(pCsr->pIter);
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_free(pCsr->zBuffer);
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_free(pCsr);
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return SQLITE_OK;
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Extract the next token from a tokenization cursor.
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int icuNext(
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char **ppToken,               /* OUT: *ppToken is the token text */
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int *pnBytes,                       /* OUT: Number of bytes in token */
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int *piStartOffset,                 /* OUT: Starting offset of token */
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int *piEndOffset,                   /* OUT: Ending offset of token */
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int *piPosition                     /* OUT: Position integer of token */
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)){
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IcuCursor *pCsr = (IcuCursor *)pCursor;
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int iStart = 0;
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int iEnd = 0;
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int nByte = 0;
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while( iStart==iEnd ){
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    UChar32 c;
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    iStart = ubrk_current(pCsr->pIter);
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    iEnd = ubrk_next(pCsr->pIter);
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( iEnd==UBRK_DONE ){
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return SQLITE_DONE;
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while( iStart<iEnd ){
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      int iWhite = iStart;
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if( u_isspace(c) ){
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        iStart = iWhite;
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }else{
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    assert(iStart<=iEnd);
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  do {
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    UErrorCode status = U_ZERO_ERROR;
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if( nByte ){
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if( !zNew ){
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return SQLITE_NOMEM;
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      pCsr->zBuffer = zNew;
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      pCsr->nBuffer = nByte;
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    u_strToUTF8(
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        &status                                  /* Output success/failure */
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    );
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } while( nByte>pCsr->nBuffer );
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *ppToken = pCsr->zBuffer;
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *pnBytes = nByte;
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *piStartOffset = pCsr->aOffset[iStart];
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *piEndOffset = pCsr->aOffset[iEnd];
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *piPosition = pCsr->iToken++;
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return SQLITE_OK;
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** The set of routines that implement the simple tokenizer
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const sqlite3_tokenizer_module icuTokenizerModule = {
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  0,                           /* iVersion */
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icuCreate,                   /* xCreate  */
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icuDestroy,                  /* xCreate  */
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icuOpen,                     /* xOpen    */
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icuClose,                    /* xClose   */
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icuNext,                     /* xNext    */
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Set *ppModule to point at the implementation of the ICU tokenizer.
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void sqlite3Fts2IcuTokenizerModule(
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sqlite3_tokenizer_module const**ppModule
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)){
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *ppModule = &icuTokenizerModule;
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif /* defined(SQLITE_ENABLE_ICU) */
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
261