15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/* 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** The author disclaims copyright to this source code. 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)************************************************************************* 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** Implementation of the "simple" full-text-search tokenizer. 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/ 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <assert.h> 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if !defined(__APPLE__) 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <malloc.h> 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <stdlib.h> 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <stdio.h> 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string.h> 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <ctype.h> 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "tokenizer.h" 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/* Duplicate a string; the caller must free() the returned string. 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * (We don't use strdup() since it's not part of the standard C library and 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * may not be available everywhere.) */ 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/* TODO(shess) Copied from fulltext.c, consider util.c for such 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)** things. */ 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static char *string_dup(const char *s){ 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char *str = malloc(strlen(s) + 1); 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) strcpy(str, s); 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return str; 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)typedef struct simple_tokenizer { 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sqlite3_tokenizer base; 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char *zDelim; /* token delimiters */ 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} simple_tokenizer; 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)typedef struct simple_tokenizer_cursor { 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sqlite3_tokenizer_cursor base; 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char *pInput; /* input we are tokenizing */ 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nBytes; /* size of the input */ 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char *pCurrent; /* current position in pInput */ 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int iToken; /* index of next token to be returned */ 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char *zToken; /* storage for current token */ 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nTokenBytes; /* actual size of current token */ 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nTokenAllocated; /* space allocated to zToken buffer */ 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} simple_tokenizer_cursor; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */ 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int simpleCreate( 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int argc, const char **argv, 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sqlite3_tokenizer **ppTokenizer 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)){ 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simple_tokenizer *t; 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer)); 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* TODO(shess) Delimiters need to remain the same from run to run, 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ** else we need to reindex. One solution would be a meta-table to 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ** track such information in the database, then we'd only want this 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ** information on the initial create. 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if( argc>1 ){ 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t->zDelim = string_dup(argv[1]); 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* Build a string excluding alphanumeric ASCII characters */ 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char zDelim[0x80]; /* nul-terminated, so nul not a member */ 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int i, j; 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for(i=1, j=0; i<0x80; i++){ 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if( !isalnum(i) ){ 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) zDelim[j++] = i; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) zDelim[j++] = '\0'; 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) assert( j<=sizeof(zDelim) ); 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) t->zDelim = string_dup(zDelim); 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *ppTokenizer = &t->base; 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return SQLITE_OK; 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simple_tokenizer *t = (simple_tokenizer *) pTokenizer; 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) free((void *) t->zDelim); 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) free(t); 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return SQLITE_OK; 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int simpleOpen( 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sqlite3_tokenizer *pTokenizer, 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char *pInput, int nBytes, 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sqlite3_tokenizer_cursor **ppCursor 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)){ 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simple_tokenizer_cursor *c; 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor)); 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->pInput = pInput; 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes; 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->pCurrent = c->pInput; /* start tokenizing at the beginning */ 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->iToken = 0; 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->zToken = NULL; /* no space allocated, yet. */ 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->nTokenBytes = 0; 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->nTokenAllocated = 0; 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *ppCursor = &c->base; 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return SQLITE_OK; 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if( NULL!=c->zToken ){ 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) free(c->zToken); 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) free(c); 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return SQLITE_OK; 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int simpleNext( 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sqlite3_tokenizer_cursor *pCursor, 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char **ppToken, int *pnBytes, 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int *piStartOffset, int *piEndOffset, int *piPosition 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)){ 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int ii; 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while( c->pCurrent-c->pInput<c->nBytes ){ 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = (int) strcspn(c->pCurrent, t->zDelim); 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if( n>0 ){ 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if( n+1>c->nTokenAllocated ){ 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->zToken = realloc(c->zToken, n+1); 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for(ii=0; ii<n; ii++){ 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* TODO(shess) This needs expansion to handle UTF-8 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ** case-insensitivity. 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char ch = c->pCurrent[ii]; 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch; 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->zToken[n] = '\0'; 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *ppToken = c->zToken; 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *pnBytes = n; 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *piStartOffset = (int) (c->pCurrent-c->pInput); 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *piEndOffset = *piStartOffset+n; 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *piPosition = c->iToken++; 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->pCurrent += n + 1; 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return SQLITE_OK; 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c->pCurrent += n + 1; 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* TODO(shess) could strspn() to skip delimiters en masse. Needs 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ** to happen in two places, though, which is annoying. 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return SQLITE_DONE; 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static sqlite3_tokenizer_module simpleTokenizerModule = { 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0, 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simpleCreate, 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simpleDestroy, 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simpleOpen, 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simpleClose, 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) simpleNext, 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void get_simple_tokenizer_module( 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sqlite3_tokenizer_module **ppModule 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)){ 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *ppModule = &simpleTokenizerModule; 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 175