/*
** The author disclaims copyright to this source code.
**
*************************************************************************
** Implementation of the "simple" full-text-search tokenizer.
*/
7
8#include <assert.h>
9#if !defined(__APPLE__)
10#include <malloc.h>
11#else
12#include <stdlib.h>
13#endif
14#include <stdio.h>
15#include <string.h>
16#include <ctype.h>
17
18#include "tokenizer.h"
19
/* Duplicate a nul-terminated string into freshly malloc()ed memory.
 *
 * Returns the copy, which the caller must free(), or NULL if s is NULL or
 * the allocation fails.
 *
 * (We don't use strdup() since it's not part of the standard C library and
 * may not be available everywhere.) */
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
  size_t nCopy;
  char *str;

  if( s==NULL ) return NULL;
  nCopy = strlen(s) + 1;            /* include the nul terminator */
  str = malloc(nCopy);
  if( str==NULL ) return NULL;      /* out of memory: report, don't crash */
  memcpy(str, s, nCopy);
  return str;
}
30
31typedef struct simple_tokenizer {
32  sqlite3_tokenizer base;
33  const char *zDelim;          /* token delimiters */
34} simple_tokenizer;
35
36typedef struct simple_tokenizer_cursor {
37  sqlite3_tokenizer_cursor base;
38  const char *pInput;          /* input we are tokenizing */
39  int nBytes;                  /* size of the input */
40  const char *pCurrent;        /* current position in pInput */
41  int iToken;                  /* index of next token to be returned */
42  char *zToken;                /* storage for current token */
43  int nTokenBytes;             /* actual size of current token */
44  int nTokenAllocated;         /* space allocated to zToken buffer */
45} simple_tokenizer_cursor;
46
47static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
48
49static int simpleCreate(
50  int argc, const char **argv,
51  sqlite3_tokenizer **ppTokenizer
52){
53  simple_tokenizer *t;
54
55  t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
56  /* TODO(shess) Delimiters need to remain the same from run to run,
57  ** else we need to reindex.  One solution would be a meta-table to
58  ** track such information in the database, then we'd only want this
59  ** information on the initial create.
60  */
61  if( argc>1 ){
62    t->zDelim = string_dup(argv[1]);
63  } else {
64    /* Build a string excluding alphanumeric ASCII characters */
65    char zDelim[0x80];               /* nul-terminated, so nul not a member */
66    int i, j;
67    for(i=1, j=0; i<0x80; i++){
68      if( !isalnum(i) ){
69        zDelim[j++] = i;
70      }
71    }
72    zDelim[j++] = '\0';
73    assert( j<=sizeof(zDelim) );
74    t->zDelim = string_dup(zDelim);
75  }
76
77  *ppTokenizer = &t->base;
78  return SQLITE_OK;
79}
80
81static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
82  simple_tokenizer *t = (simple_tokenizer *) pTokenizer;
83
84  free((void *) t->zDelim);
85  free(t);
86
87  return SQLITE_OK;
88}
89
90static int simpleOpen(
91  sqlite3_tokenizer *pTokenizer,
92  const char *pInput, int nBytes,
93  sqlite3_tokenizer_cursor **ppCursor
94){
95  simple_tokenizer_cursor *c;
96
97  c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
98  c->pInput = pInput;
99  c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
100  c->pCurrent = c->pInput;        /* start tokenizing at the beginning */
101  c->iToken = 0;
102  c->zToken = NULL;               /* no space allocated, yet. */
103  c->nTokenBytes = 0;
104  c->nTokenAllocated = 0;
105
106  *ppCursor = &c->base;
107  return SQLITE_OK;
108}
109
110static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
111  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
112
113  if( NULL!=c->zToken ){
114    free(c->zToken);
115  }
116  free(c);
117
118  return SQLITE_OK;
119}
120
121static int simpleNext(
122  sqlite3_tokenizer_cursor *pCursor,
123  const char **ppToken, int *pnBytes,
124  int *piStartOffset, int *piEndOffset, int *piPosition
125){
126  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
127  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
128  int ii;
129
130  while( c->pCurrent-c->pInput<c->nBytes ){
131    int n = (int) strcspn(c->pCurrent, t->zDelim);
132    if( n>0 ){
133      if( n+1>c->nTokenAllocated ){
134        c->zToken = realloc(c->zToken, n+1);
135      }
136      for(ii=0; ii<n; ii++){
137        /* TODO(shess) This needs expansion to handle UTF-8
138        ** case-insensitivity.
139        */
140        char ch = c->pCurrent[ii];
141        c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch;
142      }
143      c->zToken[n] = '\0';
144      *ppToken = c->zToken;
145      *pnBytes = n;
146      *piStartOffset = (int) (c->pCurrent-c->pInput);
147      *piEndOffset = *piStartOffset+n;
148      *piPosition = c->iToken++;
149      c->pCurrent += n + 1;
150
151      return SQLITE_OK;
152    }
153    c->pCurrent += n + 1;
154    /* TODO(shess) could strspn() to skip delimiters en masse.  Needs
155    ** to happen in two places, though, which is annoying.
156    */
157  }
158  return SQLITE_DONE;
159}
160
161static sqlite3_tokenizer_module simpleTokenizerModule = {
162  0,
163  simpleCreate,
164  simpleDestroy,
165  simpleOpen,
166  simpleClose,
167  simpleNext,
168};
169
170void get_simple_tokenizer_module(
171  sqlite3_tokenizer_module **ppModule
172){
173  *ppModule = &simpleTokenizerModule;
174}
175