1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code.  In place of
5** a legal notice, here is a blessing:
6**
7**    May you do good and not evil.
8**    May you find forgiveness for yourself and forgive others.
9**    May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** Implementation of the "simple" full-text-search tokenizer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19**     * The FTS2 module is being built as an extension
20**       (in which case SQLITE_CORE is not defined), or
21**
22**     * The FTS2 module is being built into the core of
23**       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32
33#include "fts2_tokenizer.h"
34
35typedef struct simple_tokenizer {
36  sqlite3_tokenizer base;
37  char delim[128];             /* flag ASCII delimiters */
38} simple_tokenizer;
39
40typedef struct simple_tokenizer_cursor {
41  sqlite3_tokenizer_cursor base;
42  const char *pInput;          /* input we are tokenizing */
43  int nBytes;                  /* size of the input */
44  int iOffset;                 /* current position in pInput */
45  int iToken;                  /* index of next token to be returned */
46  char *pToken;                /* storage for current token */
47  int nTokenAllocated;         /* space allocated to zToken buffer */
48} simple_tokenizer_cursor;
49
50
51/* Forward declaration */
52static const sqlite3_tokenizer_module simpleTokenizerModule;
53
54static int simpleDelim(simple_tokenizer *t, unsigned char c){
55  return c<0x80 && t->delim[c];
56}
57
58/*
59** Create a new tokenizer instance.
60*/
61static int simpleCreate(
62  int argc, const char * const *argv,
63  sqlite3_tokenizer **ppTokenizer
64){
65  simple_tokenizer *t;
66
67  t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
68  if( t==NULL ) return SQLITE_NOMEM;
69  memset(t, 0, sizeof(*t));
70
71  /* TODO(shess) Delimiters need to remain the same from run to run,
72  ** else we need to reindex.  One solution would be a meta-table to
73  ** track such information in the database, then we'd only want this
74  ** information on the initial create.
75  */
76  if( argc>1 ){
77    int i, n = strlen(argv[1]);
78    for(i=0; i<n; i++){
79      unsigned char ch = argv[1][i];
80      /* We explicitly don't support UTF-8 delimiters for now. */
81      if( ch>=0x80 ){
82        sqlite3_free(t);
83        return SQLITE_ERROR;
84      }
85      t->delim[ch] = 1;
86    }
87  } else {
88    /* Mark non-alphanumeric ASCII characters as delimiters */
89    int i;
90    for(i=1; i<0x80; i++){
91      t->delim[i] = !((i>='0' && i<='9') || (i>='A' && i<='Z') ||
92                      (i>='a' && i<='z'));
93    }
94  }
95
96  *ppTokenizer = &t->base;
97  return SQLITE_OK;
98}
99
100/*
101** Destroy a tokenizer
102*/
103static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
104  sqlite3_free(pTokenizer);
105  return SQLITE_OK;
106}
107
108/*
109** Prepare to begin tokenizing a particular string.  The input
110** string to be tokenized is pInput[0..nBytes-1].  A cursor
111** used to incrementally tokenize this string is returned in
112** *ppCursor.
113*/
114static int simpleOpen(
115  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
116  const char *pInput, int nBytes,        /* String to be tokenized */
117  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
118){
119  simple_tokenizer_cursor *c;
120
121  c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
122  if( c==NULL ) return SQLITE_NOMEM;
123
124  c->pInput = pInput;
125  if( pInput==0 ){
126    c->nBytes = 0;
127  }else if( nBytes<0 ){
128    c->nBytes = (int)strlen(pInput);
129  }else{
130    c->nBytes = nBytes;
131  }
132  c->iOffset = 0;                 /* start tokenizing at the beginning */
133  c->iToken = 0;
134  c->pToken = NULL;               /* no space allocated, yet. */
135  c->nTokenAllocated = 0;
136
137  *ppCursor = &c->base;
138  return SQLITE_OK;
139}
140
141/*
142** Close a tokenization cursor previously opened by a call to
143** simpleOpen() above.
144*/
145static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
146  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
147  sqlite3_free(c->pToken);
148  sqlite3_free(c);
149  return SQLITE_OK;
150}
151
152/*
153** Extract the next token from a tokenization cursor.  The cursor must
154** have been opened by a prior call to simpleOpen().
155*/
156static int simpleNext(
157  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
158  const char **ppToken,               /* OUT: *ppToken is the token text */
159  int *pnBytes,                       /* OUT: Number of bytes in token */
160  int *piStartOffset,                 /* OUT: Starting offset of token */
161  int *piEndOffset,                   /* OUT: Ending offset of token */
162  int *piPosition                     /* OUT: Position integer of token */
163){
164  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
165  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
166  unsigned char *p = (unsigned char *)c->pInput;
167
168  while( c->iOffset<c->nBytes ){
169    int iStartOffset;
170
171    /* Scan past delimiter characters */
172    while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
173      c->iOffset++;
174    }
175
176    /* Count non-delimiter characters. */
177    iStartOffset = c->iOffset;
178    while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
179      c->iOffset++;
180    }
181
182    if( c->iOffset>iStartOffset ){
183      int i, n = c->iOffset-iStartOffset;
184      if( n>c->nTokenAllocated ){
185        c->nTokenAllocated = n+20;
186        c->pToken = sqlite3_realloc(c->pToken, c->nTokenAllocated);
187        if( c->pToken==NULL ) return SQLITE_NOMEM;
188      }
189      for(i=0; i<n; i++){
190        /* TODO(shess) This needs expansion to handle UTF-8
191        ** case-insensitivity.
192        */
193        unsigned char ch = p[iStartOffset+i];
194        c->pToken[i] = (ch>='A' && ch<='Z') ? (ch - 'A' + 'a') : ch;
195      }
196      *ppToken = c->pToken;
197      *pnBytes = n;
198      *piStartOffset = iStartOffset;
199      *piEndOffset = c->iOffset;
200      *piPosition = c->iToken++;
201
202      return SQLITE_OK;
203    }
204  }
205  return SQLITE_DONE;
206}
207
208/*
209** The set of routines that implement the simple tokenizer
210*/
211static const sqlite3_tokenizer_module simpleTokenizerModule = {
212  0,
213  simpleCreate,
214  simpleDestroy,
215  simpleOpen,
216  simpleClose,
217  simpleNext,
218};
219
220/*
221** Allocate a new simple tokenizer.  Return a pointer to the new
222** tokenizer in *ppModule
223*/
224void sqlite3Fts2SimpleTokenizerModule(
225  sqlite3_tokenizer_module const**ppModule
226){
227  *ppModule = &simpleTokenizerModule;
228}
229
230#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
231