1/*
2** 2009 Nov 12
3**
4** The author disclaims copyright to this source code.  In place of
5** a legal notice, here is a blessing:
6**
7**    May you do good and not evil.
8**    May you find forgiveness for yourself and forgive others.
9**    May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13*/
14
15#ifndef _FTSINT_H
16#define _FTSINT_H
17
18#if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
19# define NDEBUG 1
20#endif
21
22#include "sqlite3.h"
23#include "fts3_tokenizer.h"
24#include "fts3_hash.h"
25
26/*
27** This constant controls how often segments are merged. Once there are
28** FTS3_MERGE_COUNT segments of level N, they are merged into a single
29** segment of level N+1.
30*/
31#define FTS3_MERGE_COUNT 16
32
33/*
34** This is the maximum amount of data (in bytes) to store in the
35** Fts3Table.pendingTerms hash table. Normally, the hash table is
36** populated as documents are inserted/updated/deleted in a transaction
37** and used to create a new segment when the transaction is committed.
38** However if this limit is reached midway through a transaction, a new
39** segment is created and the hash table cleared immediately.
40*/
41#define FTS3_MAX_PENDING_DATA (1*1024*1024)
42
43/*
44** Macro to return the number of elements in an array. SQLite has a
45** similar macro called ArraySize(). Use a different name to avoid
46** a collision when building an amalgamation with built-in FTS3.
47*/
48#define SizeofArray(X) ((int)(sizeof(X)/sizeof(X[0])))
49
50/*
51** Maximum length of a varint encoded integer. The varint format is different
52** from that used by SQLite, so the maximum length is 10, not 9.
53*/
54#define FTS3_VARINT_MAX 10
55
56/*
57** The testcase() macro is only used by the amalgamation.  If undefined,
58** make it a no-op.
59*/
60#ifndef testcase
61# define testcase(X)
62#endif
63
64/*
65** Terminator values for position-lists and column-lists.
66*/
67#define POS_COLUMN  (1)     /* Column-list terminator */
68#define POS_END     (0)     /* Position-list terminator */
69
70/*
71** This section provides definitions to allow the
72** FTS3 extension to be compiled outside of the
73** amalgamation.
74*/
75#ifndef SQLITE_AMALGAMATION
76/*
77** Macros indicating that conditional expressions are always true or
78** false.
79*/
80#ifdef SQLITE_COVERAGE_TEST
81# define ALWAYS(x) (1)
82# define NEVER(X)  (0)
83#else
84# define ALWAYS(x) (x)
85# define NEVER(X)  (x)
86#endif
87
88/*
89** Internal types used by SQLite.
90*/
91typedef unsigned char u8;         /* 1-byte (or larger) unsigned integer */
92typedef short int i16;            /* 2-byte (or larger) signed integer */
93typedef unsigned int u32;         /* 4-byte unsigned integer */
94typedef sqlite3_uint64 u64;       /* 8-byte unsigned integer */
95/*
96** Macro used to suppress compiler warnings for unused parameters.
97*/
98#define UNUSED_PARAMETER(x) (void)(x)
99#endif
100
101typedef struct Fts3Table Fts3Table;
102typedef struct Fts3Cursor Fts3Cursor;
103typedef struct Fts3Expr Fts3Expr;
104typedef struct Fts3Phrase Fts3Phrase;
105typedef struct Fts3PhraseToken Fts3PhraseToken;
106
107typedef struct Fts3SegFilter Fts3SegFilter;
108typedef struct Fts3DeferredToken Fts3DeferredToken;
109typedef struct Fts3SegReader Fts3SegReader;
110typedef struct Fts3SegReaderCursor Fts3SegReaderCursor;
111
112/*
113** A connection to a fulltext index is an instance of the following
114** structure. The xCreate and xConnect methods create an instance
115** of this structure and xDestroy and xDisconnect free that instance.
116** All other methods receive a pointer to the structure as one of their
117** arguments.
118*/
119struct Fts3Table {
120  sqlite3_vtab base;              /* Base class used by SQLite core */
121  sqlite3 *db;                    /* The database connection */
122  const char *zDb;                /* logical database name */
123  const char *zName;              /* virtual table name */
124  int nColumn;                    /* number of named columns in virtual table */
125  char **azColumn;                /* column names.  malloced */
126  sqlite3_tokenizer *pTokenizer;  /* tokenizer for inserts and queries */
127
128  /* Precompiled statements used by the implementation. Each of these
129  ** statements is run and reset within a single virtual table API call.
130  */
131  sqlite3_stmt *aStmt[24];
132
133  char *zReadExprlist;
134  char *zWriteExprlist;
135
136  int nNodeSize;                  /* Soft limit for node size */
137  u8 bHasStat;                    /* True if %_stat table exists */
138  u8 bHasDocsize;                 /* True if %_docsize table exists */
139  int nPgsz;                      /* Page size for host database */
140  char *zSegmentsTbl;             /* Name of %_segments table */
141  sqlite3_blob *pSegments;        /* Blob handle open on %_segments table */
142
143  /* The following hash table is used to buffer pending index updates during
144  ** transactions. Variable nPendingData estimates the memory size of the
145  ** pending data, including hash table overhead, but not malloc overhead.
146  ** When nPendingData exceeds nMaxPendingData, the buffer is flushed
147  ** automatically. Variable iPrevDocid is the docid of the most recently
148  ** inserted record.
149  */
150  int nMaxPendingData;
151  int nPendingData;
152  sqlite_int64 iPrevDocid;
153  Fts3Hash pendingTerms;
154};
155
156/*
157** When the core wants to read from the virtual table, it creates a
158** virtual table cursor (an instance of the following structure) using
159** the xOpen method. Cursors are destroyed using the xClose method.
160*/
161struct Fts3Cursor {
162  sqlite3_vtab_cursor base;       /* Base class used by SQLite core */
163  i16 eSearch;                    /* Search strategy (see below) */
164  u8 isEof;                       /* True if at End Of Results */
165  u8 isRequireSeek;               /* True if must seek pStmt to %_content row */
166  sqlite3_stmt *pStmt;            /* Prepared statement in use by the cursor */
167  Fts3Expr *pExpr;                /* Parsed MATCH query string */
168  int nPhrase;                    /* Number of matchable phrases in query */
169  Fts3DeferredToken *pDeferred;   /* Deferred search tokens, if any */
170  sqlite3_int64 iPrevId;          /* Previous id read from aDoclist */
171  char *pNextId;                  /* Pointer into the body of aDoclist */
172  char *aDoclist;                 /* List of docids for full-text queries */
173  int nDoclist;                   /* Size of buffer at aDoclist */
174  int eEvalmode;                  /* An FTS3_EVAL_XX constant */
175  int nRowAvg;                    /* Average size of database rows, in pages */
176
177  int isMatchinfoNeeded;          /* True when aMatchinfo[] needs filling in */
178  u32 *aMatchinfo;                /* Information about most recent match */
179  int nMatchinfo;                 /* Number of elements in aMatchinfo[] */
180  char *zMatchinfo;               /* Matchinfo specification */
181};
182
183#define FTS3_EVAL_FILTER    0
184#define FTS3_EVAL_NEXT      1
185#define FTS3_EVAL_MATCHINFO 2
186
187/*
188** The Fts3Cursor.eSearch member is always set to one of the following.
189** Actualy, Fts3Cursor.eSearch can be greater than or equal to
190** FTS3_FULLTEXT_SEARCH.  If so, then Fts3Cursor.eSearch - 2 is the index
191** of the column to be searched.  For example, in
192**
193**     CREATE VIRTUAL TABLE ex1 USING fts3(a,b,c,d);
194**     SELECT docid FROM ex1 WHERE b MATCH 'one two three';
195**
196** Because the LHS of the MATCH operator is 2nd column "b",
197** Fts3Cursor.eSearch will be set to FTS3_FULLTEXT_SEARCH+1.  (+0 for a,
198** +1 for b, +2 for c, +3 for d.)  If the LHS of MATCH were "ex1"
199** indicating that all columns should be searched,
200** then eSearch would be set to FTS3_FULLTEXT_SEARCH+4.
201*/
202#define FTS3_FULLSCAN_SEARCH 0    /* Linear scan of %_content table */
203#define FTS3_DOCID_SEARCH    1    /* Lookup by rowid on %_content table */
204#define FTS3_FULLTEXT_SEARCH 2    /* Full-text index search */
205
206/*
207** A "phrase" is a sequence of one or more tokens that must match in
208** sequence.  A single token is the base case and the most common case.
209** For a sequence of tokens contained in double-quotes (i.e. "one two three")
210** nToken will be the number of tokens in the string.
211**
212** The nDocMatch and nMatch variables contain data that may be used by the
213** matchinfo() function. They are populated when the full-text index is
214** queried for hits on the phrase. If one or more tokens in the phrase
215** are deferred, the nDocMatch and nMatch variables are populated based
216** on the assumption that the
217*/
218struct Fts3PhraseToken {
219  char *z;                        /* Text of the token */
220  int n;                          /* Number of bytes in buffer z */
221  int isPrefix;                   /* True if token ends with a "*" character */
222  int bFulltext;                  /* True if full-text index was used */
223  Fts3SegReaderCursor *pSegcsr;   /* Segment-reader for this token */
224  Fts3DeferredToken *pDeferred;   /* Deferred token object for this token */
225};
226
227struct Fts3Phrase {
228  /* Variables populated by fts3_expr.c when parsing a MATCH expression */
229  int nToken;                /* Number of tokens in the phrase */
230  int iColumn;               /* Index of column this phrase must match */
231  int isNot;                 /* Phrase prefixed by unary not (-) operator */
232  Fts3PhraseToken aToken[1]; /* One entry for each token in the phrase */
233};
234
235/*
236** A tree of these objects forms the RHS of a MATCH operator.
237**
238** If Fts3Expr.eType is either FTSQUERY_NEAR or FTSQUERY_PHRASE and isLoaded
239** is true, then aDoclist points to a malloced buffer, size nDoclist bytes,
240** containing the results of the NEAR or phrase query in FTS3 doclist
241** format. As usual, the initial "Length" field found in doclists stored
242** on disk is omitted from this buffer.
243**
244** Variable pCurrent always points to the start of a docid field within
245** aDoclist. Since the doclist is usually scanned in docid order, this can
246** be used to accelerate seeking to the required docid within the doclist.
247*/
248struct Fts3Expr {
249  int eType;                 /* One of the FTSQUERY_XXX values defined below */
250  int nNear;                 /* Valid if eType==FTSQUERY_NEAR */
251  Fts3Expr *pParent;         /* pParent->pLeft==this or pParent->pRight==this */
252  Fts3Expr *pLeft;           /* Left operand */
253  Fts3Expr *pRight;          /* Right operand */
254  Fts3Phrase *pPhrase;       /* Valid if eType==FTSQUERY_PHRASE */
255
256  int isLoaded;              /* True if aDoclist/nDoclist are initialized. */
257  char *aDoclist;            /* Buffer containing doclist */
258  int nDoclist;              /* Size of aDoclist in bytes */
259
260  sqlite3_int64 iCurrent;
261  char *pCurrent;
262};
263
264/*
265** Candidate values for Fts3Query.eType. Note that the order of the first
266** four values is in order of precedence when parsing expressions. For
267** example, the following:
268**
269**   "a OR b AND c NOT d NEAR e"
270**
271** is equivalent to:
272**
273**   "a OR (b AND (c NOT (d NEAR e)))"
274*/
275#define FTSQUERY_NEAR   1
276#define FTSQUERY_NOT    2
277#define FTSQUERY_AND    3
278#define FTSQUERY_OR     4
279#define FTSQUERY_PHRASE 5
280
281
282/* fts3_write.c */
283int sqlite3Fts3UpdateMethod(sqlite3_vtab*,int,sqlite3_value**,sqlite3_int64*);
284int sqlite3Fts3PendingTermsFlush(Fts3Table *);
285void sqlite3Fts3PendingTermsClear(Fts3Table *);
286int sqlite3Fts3Optimize(Fts3Table *);
287int sqlite3Fts3SegReaderNew(int, sqlite3_int64,
288  sqlite3_int64, sqlite3_int64, const char *, int, Fts3SegReader**);
289int sqlite3Fts3SegReaderPending(Fts3Table*,const char*,int,int,Fts3SegReader**);
290void sqlite3Fts3SegReaderFree(Fts3SegReader *);
291int sqlite3Fts3SegReaderCost(Fts3Cursor *, Fts3SegReader *, int *);
292int sqlite3Fts3AllSegdirs(Fts3Table*, int, sqlite3_stmt **);
293int sqlite3Fts3ReadLock(Fts3Table *);
294int sqlite3Fts3ReadBlock(Fts3Table*, sqlite3_int64, char **, int*);
295
296int sqlite3Fts3SelectDoctotal(Fts3Table *, sqlite3_stmt **);
297int sqlite3Fts3SelectDocsize(Fts3Table *, sqlite3_int64, sqlite3_stmt **);
298
299void sqlite3Fts3FreeDeferredTokens(Fts3Cursor *);
300int sqlite3Fts3DeferToken(Fts3Cursor *, Fts3PhraseToken *, int);
301int sqlite3Fts3CacheDeferredDoclists(Fts3Cursor *);
302void sqlite3Fts3FreeDeferredDoclists(Fts3Cursor *);
303char *sqlite3Fts3DeferredDoclist(Fts3DeferredToken *, int *);
304void sqlite3Fts3SegmentsClose(Fts3Table *);
305
306#define FTS3_SEGCURSOR_PENDING -1
307#define FTS3_SEGCURSOR_ALL     -2
308
309int sqlite3Fts3SegReaderStart(Fts3Table*, Fts3SegReaderCursor*, Fts3SegFilter*);
310int sqlite3Fts3SegReaderStep(Fts3Table *, Fts3SegReaderCursor *);
311void sqlite3Fts3SegReaderFinish(Fts3SegReaderCursor *);
312int sqlite3Fts3SegReaderCursor(
313    Fts3Table *, int, const char *, int, int, int, Fts3SegReaderCursor *);
314
315/* Flags allowed as part of the 4th argument to SegmentReaderIterate() */
316#define FTS3_SEGMENT_REQUIRE_POS   0x00000001
317#define FTS3_SEGMENT_IGNORE_EMPTY  0x00000002
318#define FTS3_SEGMENT_COLUMN_FILTER 0x00000004
319#define FTS3_SEGMENT_PREFIX        0x00000008
320#define FTS3_SEGMENT_SCAN          0x00000010
321
322/* Type passed as 4th argument to SegmentReaderIterate() */
323struct Fts3SegFilter {
324  const char *zTerm;
325  int nTerm;
326  int iCol;
327  int flags;
328};
329
330struct Fts3SegReaderCursor {
331  /* Used internally by sqlite3Fts3SegReaderXXX() calls */
332  Fts3SegReader **apSegment;      /* Array of Fts3SegReader objects */
333  int nSegment;                   /* Size of apSegment array */
334  int nAdvance;                   /* How many seg-readers to advance */
335  Fts3SegFilter *pFilter;         /* Pointer to filter object */
336  char *aBuffer;                  /* Buffer to merge doclists in */
337  int nBuffer;                    /* Allocated size of aBuffer[] in bytes */
338
339  /* Cost of running this iterator. Used by fts3.c only. */
340  int nCost;
341
342  /* Output values. Valid only after Fts3SegReaderStep() returns SQLITE_ROW. */
343  char *zTerm;                    /* Pointer to term buffer */
344  int nTerm;                      /* Size of zTerm in bytes */
345  char *aDoclist;                 /* Pointer to doclist buffer */
346  int nDoclist;                   /* Size of aDoclist[] in bytes */
347};
348
349/* fts3.c */
350int sqlite3Fts3PutVarint(char *, sqlite3_int64);
351int sqlite3Fts3GetVarint(const char *, sqlite_int64 *);
352int sqlite3Fts3GetVarint32(const char *, int *);
353int sqlite3Fts3VarintLen(sqlite3_uint64);
354void sqlite3Fts3Dequote(char *);
355
356char *sqlite3Fts3FindPositions(Fts3Expr *, sqlite3_int64, int);
357int sqlite3Fts3ExprLoadDoclist(Fts3Cursor *, Fts3Expr *);
358int sqlite3Fts3ExprLoadFtDoclist(Fts3Cursor *, Fts3Expr *, char **, int *);
359int sqlite3Fts3ExprNearTrim(Fts3Expr *, Fts3Expr *, int);
360
361/* fts3_tokenizer.c */
362const char *sqlite3Fts3NextToken(const char *, int *);
363int sqlite3Fts3InitHashTable(sqlite3 *, Fts3Hash *, const char *);
364int sqlite3Fts3InitTokenizer(Fts3Hash *pHash, const char *,
365    sqlite3_tokenizer **, char **
366);
367int sqlite3Fts3IsIdChar(char);
368
369/* fts3_snippet.c */
370void sqlite3Fts3Offsets(sqlite3_context*, Fts3Cursor*);
371void sqlite3Fts3Snippet(sqlite3_context *, Fts3Cursor *, const char *,
372  const char *, const char *, int, int
373);
374void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *, const char *);
375
376/* fts3_expr.c */
377int sqlite3Fts3ExprParse(sqlite3_tokenizer *,
378  char **, int, int, const char *, int, Fts3Expr **
379);
380void sqlite3Fts3ExprFree(Fts3Expr *);
381#ifdef SQLITE_TEST
382int sqlite3Fts3ExprInitTestInterface(sqlite3 *db);
383#endif
384
385/* fts3_aux.c */
386int sqlite3Fts3InitAux(sqlite3 *db);
387
388#endif /* _FTSINT_H */
389