1/*
2** 2012 Jan 11
3**
4** The author disclaims copyright to this source code.  In place of
5** a legal notice, here is a blessing:
6**
7**    May you do good and not evil.
8**    May you find forgiveness for yourself and forgive others.
9**    May you share freely, never taking more than you give.
10*/
11/* TODO(shess): THIS MODULE IS STILL EXPERIMENTAL.  DO NOT USE IT. */
12/* Implements a virtual table "recover" which can be used to recover
13 * data from a corrupt table.  The table is walked manually, with
14 * corrupt items skipped.  Additionally, any errors while reading will
15 * be skipped.
16 *
17 * Given a table with this definition:
18 *
19 * CREATE TABLE Stuff (
20 *   name TEXT PRIMARY KEY,
21 *   value TEXT NOT NULL
22 * );
23 *
24 * to recover the data from teh table, you could do something like:
25 *
26 * -- Attach another database, the original is not trustworthy.
27 * ATTACH DATABASE '/tmp/db.db' AS rdb;
28 * -- Create a new version of the table.
29 * CREATE TABLE rdb.Stuff (
30 *   name TEXT PRIMARY KEY,
31 *   value TEXT NOT NULL
32 * );
33 * -- This will read the original table's data.
34 * CREATE VIRTUAL TABLE temp.recover_Stuff using recover(
35 *   main.Stuff,
36 *   name TEXT STRICT NOT NULL,  -- only real TEXT data allowed
37 *   value TEXT STRICT NOT NULL
38 * );
39 * -- Corruption means the UNIQUE constraint may no longer hold for
40 * -- Stuff, so either OR REPLACE or OR IGNORE must be used.
41 * INSERT OR REPLACE INTO rdb.Stuff (rowid, name, value )
42 *   SELECT rowid, name, value FROM temp.recover_Stuff;
43 * DROP TABLE temp.recover_Stuff;
44 * DETACH DATABASE rdb;
45 * -- Move db.db to replace original db in filesystem.
46 *
47 *
48 * Usage
49 *
50 * Given the goal of dealing with corruption, it would not be safe to
51 * create a recovery table in the database being recovered.  So
52 * recovery tables must be created in the temp database.  They are not
53 * appropriate to persist, in any case.  [As a bonus, sqlite_master
54 * tables can be recovered.  Perhaps more cute than useful, though.]
55 *
56 * The parameters are a specifier for the table to read, and a column
57 * definition for each bit of data stored in that table.  The named
58 * table must be convertable to a root page number by reading the
59 * sqlite_master table.  Bare table names are assumed to be in
60 * database 0 ("main"), other databases can be specified in db.table
61 * fashion.
62 *
63 * Column definitions are similar to BUT NOT THE SAME AS those
64 * provided to CREATE statements:
65 *  column-def: column-name [type-name [STRICT] [NOT NULL]]
66 *  type-name: (ANY|ROWID|INTEGER|FLOAT|NUMERIC|TEXT|BLOB)
67 *
68 * Only those exact type names are accepted, there is no type
69 * intuition.  The only constraints accepted are STRICT (see below)
70 * and NOT NULL.  Anything unexpected will cause the create to fail.
71 *
72 * ANY is a convenience to indicate that manifest typing is desired.
73 * It is equivalent to not specifying a type at all.  The results for
74 * such columns will have the type of the data's storage.  The exposed
75 * schema will contain no type for that column.
76 *
77 * ROWID is used for columns representing aliases to the rowid
78 * (INTEGER PRIMARY KEY, with or without AUTOINCREMENT), to make the
79 * concept explicit.  Such columns are actually stored as NULL, so
80 * they cannot be simply ignored.  The exposed schema will be INTEGER
81 * for that column.
82 *
83 * NOT NULL causes rows with a NULL in that column to be skipped.  It
84 * also adds NOT NULL to the column in the exposed schema.  If the
85 * table has ever had columns added using ALTER TABLE, then those
86 * columns implicitly contain NULL for rows which have not been
87 * updated.  [Workaround using COALESCE() in your SELECT statement.]
88 *
89 * The created table is read-only, with no indices.  Any SELECT will
90 * be a full-table scan, returning each valid row read from the
91 * storage of the backing table.  The rowid will be the rowid of the
92 * row from the backing table.  "Valid" means:
93 * - The cell metadata for the row is well-formed.  Mainly this means that
94 *   the cell header info describes a payload of the size indicated by
95 *   the cell's payload size.
96 * - The cell does not run off the page.
97 * - The cell does not overlap any other cell on the page.
98 * - The cell contains doesn't contain too many columns.
99 * - The types of the serialized data match the indicated types (see below).
100 *
101 *
102 * Type affinity versus type storage.
103 *
104 * http://www.sqlite.org/datatype3.html describes SQLite's type
105 * affinity system.  The system provides for automated coercion of
106 * types in certain cases, transparently enough that many developers
107 * do not realize that it is happening.  Importantly, it implies that
108 * the raw data stored in the database may not have the obvious type.
109 *
110 * Differences between the stored data types and the expected data
111 * types may be a signal of corruption.  This module makes some
112 * allowances for automatic coercion.  It is important to be concious
113 * of the difference between the schema exposed by the module, and the
114 * data types read from storage.  The following table describes how
115 * the module interprets things:
116 *
117 * type     schema   data                     STRICT
118 * ----     ------   ----                     ------
119 * ANY      <none>   any                      any
120 * ROWID    INTEGER  n/a                      n/a
121 * INTEGER  INTEGER  integer                  integer
122 * FLOAT    FLOAT    integer or float         float
123 * NUMERIC  NUMERIC  integer, float, or text  integer or float
124 * TEXT     TEXT     text or blob             text
125 * BLOB     BLOB     blob                     blob
126 *
127 * type is the type provided to the recover module, schema is the
128 * schema exposed by the module, data is the acceptable types of data
129 * decoded from storage, and STRICT is a modification of that.
130 *
131 * A very loose recovery system might use ANY for all columns, then
132 * use the appropriate sqlite3_column_*() calls to coerce to expected
133 * types.  This doesn't provide much protection if a page from a
134 * different table with the same column count is linked into an
135 * inappropriate btree.
136 *
137 * A very tight recovery system might use STRICT to enforce typing on
138 * all columns, preferring to skip rows which are valid at the storage
139 * level but don't contain the right types.  Note that FLOAT STRICT is
140 * almost certainly not appropriate, since integral values are
141 * transparently stored as integers, when that is more efficient.
142 *
143 * Another option is to use ANY for all columns and inspect each
144 * result manually (using sqlite3_column_*).  This should only be
145 * necessary in cases where developers have used manifest typing (test
146 * to make sure before you decide that you aren't using manifest
147 * typing!).
148 *
149 *
150 * Caveats
151 *
152 * Leaf pages not referenced by interior nodes will not be found.
153 *
154 * Leaf pages referenced from interior nodes of other tables will not
155 * be resolved.
156 *
157 * Rows referencing invalid overflow pages will be skipped.
158 *
159 * SQlite rows have a header which describes how to interpret the rest
160 * of the payload.  The header can be valid in cases where the rest of
161 * the record is actually corrupt (in the sense that the data is not
162 * the intended data).  This can especially happen WRT overflow pages,
163 * as lack of atomic updates between pages is the primary form of
164 * corruption I have seen in the wild.
165 */
166/* The implementation is via a series of cursors.  The cursor
167 * implementations follow the pattern:
168 *
169 * // Creates the cursor using various initialization info.
170 * int cursorCreate(...);
171 *
172 * // Returns 1 if there is no more data, 0 otherwise.
173 * int cursorEOF(Cursor *pCursor);
174 *
175 * // Various accessors can be used if not at EOF.
176 *
177 * // Move to the next item.
178 * int cursorNext(Cursor *pCursor);
179 *
180 * // Destroy the memory associated with the cursor.
181 * void cursorDestroy(Cursor *pCursor);
182 *
183 * References in the following are to sections at
184 * http://www.sqlite.org/fileformat2.html .
185 *
186 * RecoverLeafCursor iterates the records in a leaf table node
187 * described in section 1.5 "B-tree Pages".  When the node is
188 * exhausted, an interior cursor is used to get the next leaf node,
189 * and iteration continues there.
190 *
191 * RecoverInteriorCursor iterates the child pages in an interior table
192 * node described in section 1.5 "B-tree Pages".  When the node is
193 * exhausted, a parent interior cursor is used to get the next
194 * interior node at the same level, and iteration continues there.
195 *
196 * Together these record the path from the leaf level to the root of
197 * the tree.  Iteration happens from the leaves rather than the root
198 * both for efficiency and putting the special case at the front of
199 * the list is easier to implement.
200 *
201 * RecoverCursor uses a RecoverLeafCursor to iterate the rows of a
202 * table, returning results via the SQLite virtual table interface.
203 */
204/* TODO(shess): It might be useful to allow DEFAULT in types to
205 * specify what to do for NULL when an ALTER TABLE case comes up.
206 * Unfortunately, simply adding it to the exposed schema and using
207 * sqlite3_result_null() does not cause the default to be generate.
208 * Handling it ourselves seems hard, unfortunately.
209 */
210
211#include <assert.h>
212#include <ctype.h>
213#include <stdio.h>
214#include <string.h>
215
216/* Internal SQLite things that are used:
217 * u32, u64, i64 types.
218 * Btree, Pager, and DbPage structs.
219 * DbPage.pData, .pPager, and .pgno
220 * sqlite3 struct.
221 * sqlite3BtreePager() and sqlite3BtreeGetPageSize()
222 * sqlite3PagerAcquire() and sqlite3PagerUnref()
223 * getVarint().
224 */
225#include "sqliteInt.h"
226
227/* For debugging. */
228#if 0
229#define FNENTRY() fprintf(stderr, "In %s\n", __FUNCTION__)
230#else
231#define FNENTRY()
232#endif
233
234/* Generic constants and helper functions. */
235
236static const unsigned char kTableLeafPage = 0x0D;
237static const unsigned char kTableInteriorPage = 0x05;
238
239/* From section 1.5. */
240static const unsigned kiPageTypeOffset = 0;
241static const unsigned kiPageFreeBlockOffset = 1;
242static const unsigned kiPageCellCountOffset = 3;
243static const unsigned kiPageCellContentOffset = 5;
244static const unsigned kiPageFragmentedBytesOffset = 7;
245static const unsigned knPageLeafHeaderBytes = 8;
246/* Interior pages contain an additional field. */
247static const unsigned kiPageRightChildOffset = 8;
248static const unsigned kiPageInteriorHeaderBytes = 12;
249
250/* Accepted types are specified by a mask. */
251#define MASK_ROWID (1<<0)
252#define MASK_INTEGER (1<<1)
253#define MASK_FLOAT (1<<2)
254#define MASK_TEXT (1<<3)
255#define MASK_BLOB (1<<4)
256#define MASK_NULL (1<<5)
257
258/* Helpers to decode fixed-size fields. */
259static u32 decodeUnsigned16(const unsigned char *pData){
260  return (pData[0]<<8) + pData[1];
261}
262static u32 decodeUnsigned32(const unsigned char *pData){
263  return (decodeUnsigned16(pData)<<16) + decodeUnsigned16(pData+2);
264}
265static i64 decodeSigned(const unsigned char *pData, unsigned nBytes){
266  i64 r = (char)(*pData);
267  while( --nBytes ){
268    r <<= 8;
269    r += *(++pData);
270  }
271  return r;
272}
273/* Derived from vdbeaux.c, sqlite3VdbeSerialGet(), case 7. */
274/* TODO(shess): Determine if swapMixedEndianFloat() applies. */
275static double decodeFloat64(const unsigned char *pData){
276#if !defined(NDEBUG)
277  static const u64 t1 = ((u64)0x3ff00000)<<32;
278  static const double r1 = 1.0;
279  u64 t2 = t1;
280  assert( sizeof(r1)==sizeof(t2) && memcmp(&r1, &t2, sizeof(r1))==0 );
281#endif
282  i64 x = decodeSigned(pData, 8);
283  double d;
284  memcpy(&d, &x, sizeof(x));
285  return d;
286}
287
288/* Return true if a varint can safely be read from pData/nData. */
289/* TODO(shess): DbPage points into the middle of a buffer which
290 * contains the page data before DbPage.  So code should always be
291 * able to read a small number of varints safely.  Consider whether to
292 * trust that or not.
293 */
294static int checkVarint(const unsigned char *pData, unsigned nData){
295  unsigned i;
296
297  /* In the worst case the decoder takes all 8 bits of the 9th byte. */
298  if( nData>=9 ){
299    return 1;
300  }
301
302  /* Look for a high-bit-clear byte in what's left. */
303  for( i=0; i<nData; ++i ){
304    if( !(pData[i]&0x80) ){
305      return 1;
306    }
307  }
308
309  /* Cannot decode in the space given. */
310  return 0;
311}
312
313/* Return 1 if n varints can be read from pData/nData. */
314static int checkVarints(const unsigned char *pData, unsigned nData,
315                        unsigned n){
316  unsigned nCur = 0;   /* Byte offset within current varint. */
317  unsigned nFound = 0; /* Number of varints found. */
318  unsigned i;
319
320  /* In the worst case the decoder takes all 8 bits of the 9th byte. */
321  if( nData>=9*n ){
322    return 1;
323  }
324
325  for( i=0; nFound<n && i<nData; ++i ){
326    nCur++;
327    if( nCur==9 || !(pData[i]&0x80) ){
328      nFound++;
329      nCur = 0;
330    }
331  }
332
333  return nFound==n;
334}
335
336/* ctype and str[n]casecmp() can be affected by locale (eg, tr_TR).
337 * These versions consider only the ASCII space.
338 */
339/* TODO(shess): It may be reasonable to just remove the need for these
340 * entirely.  The module could require "TEXT STRICT NOT NULL", not
341 * "Text Strict Not Null" or whatever the developer felt like typing
342 * that day.  Handling corrupt data is a PERFECT place to be pedantic.
343 */
344static int ascii_isspace(char c){
345  /* From fts3_expr.c */
346  return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
347}
348static int ascii_isalnum(int x){
349  /* From fts3_tokenizer1.c */
350  return (x>='0' && x<='9') || (x>='A' && x<='Z') || (x>='a' && x<='z');
351}
352static int ascii_tolower(int x){
353  /* From fts3_tokenizer1.c */
354  return (x>='A' && x<='Z') ? x-'A'+'a' : x;
355}
356/* TODO(shess): Consider sqlite3_strnicmp() */
357static int ascii_strncasecmp(const char *s1, const char *s2, size_t n){
358  const unsigned char *us1 = (const unsigned char *)s1;
359  const unsigned char *us2 = (const unsigned char *)s2;
360  while( *us1 && *us2 && n && ascii_tolower(*us1)==ascii_tolower(*us2) ){
361    us1++, us2++, n--;
362  }
363  return n ? ascii_tolower(*us1)-ascii_tolower(*us2) : 0;
364}
365static int ascii_strcasecmp(const char *s1, const char *s2){
366  /* If s2 is equal through strlen(s1), will exit while() due to s1's
367   * trailing NUL, and return NUL-s2[strlen(s1)].
368   */
369  return ascii_strncasecmp(s1, s2, strlen(s1)+1);
370}
371
372/* For some reason I kept making mistakes with offset calculations. */
373static const unsigned char *PageData(DbPage *pPage, unsigned iOffset){
374  assert( iOffset<=pPage->nPageSize );
375  return (unsigned char *)pPage->pData + iOffset;
376}
377
378/* The first page in the file contains a file header in the first 100
379 * bytes.  The page's header information comes after that.  Note that
380 * the offsets in the page's header information are relative to the
381 * beginning of the page, NOT the end of the page header.
382 */
383static const unsigned char *PageHeader(DbPage *pPage){
384  if( pPage->pgno==1 ){
385    const unsigned nDatabaseHeader = 100;
386    return PageData(pPage, nDatabaseHeader);
387  }else{
388    return PageData(pPage, 0);
389  }
390}
391
392/* Helper to fetch the pager and page size for the named database. */
393static int GetPager(sqlite3 *db, const char *zName,
394                    Pager **pPager, unsigned *pnPageSize){
395  Btree *pBt = NULL;
396  int i;
397  for( i=0; i<db->nDb; ++i ){
398    if( ascii_strcasecmp(db->aDb[i].zName, zName)==0 ){
399      pBt = db->aDb[i].pBt;
400      break;
401    }
402  }
403  if( !pBt ){
404    return SQLITE_ERROR;
405  }
406
407  *pPager = sqlite3BtreePager(pBt);
408  *pnPageSize = sqlite3BtreeGetPageSize(pBt) - sqlite3BtreeGetReserve(pBt);
409  return SQLITE_OK;
410}
411
412/* iSerialType is a type read from a record header.  See "2.1 Record Format".
413 */
414
415/* Storage size of iSerialType in bytes.  My interpretation of SQLite
416 * documentation is that text and blob fields can have 32-bit length.
417 * Values past 2^31-12 will need more than 32 bits to encode, which is
418 * why iSerialType is u64.
419 */
420static u32 SerialTypeLength(u64 iSerialType){
421  switch( iSerialType ){
422    case 0 : return 0;  /* NULL */
423    case 1 : return 1;  /* Various integers. */
424    case 2 : return 2;
425    case 3 : return 3;
426    case 4 : return 4;
427    case 5 : return 6;
428    case 6 : return 8;
429    case 7 : return 8;  /* 64-bit float. */
430    case 8 : return 0;  /* Constant 0. */
431    case 9 : return 0;  /* Constant 1. */
432    case 10 : case 11 : assert( !"RESERVED TYPE"); return 0;
433  }
434  return (u32)((iSerialType>>1) - 6);
435}
436
437/* True if iSerialType refers to a blob. */
438static int SerialTypeIsBlob(u64 iSerialType){
439  assert( iSerialType>=12 );
440  return (iSerialType%2)==0;
441}
442
443/* Returns true if the serialized type represented by iSerialType is
444 * compatible with the given type mask.
445 */
446static int SerialTypeIsCompatible(u64 iSerialType, unsigned char mask){
447  switch( iSerialType ){
448    case 0  : return (mask&MASK_NULL)!=0;
449    case 1  : return (mask&MASK_INTEGER)!=0;
450    case 2  : return (mask&MASK_INTEGER)!=0;
451    case 3  : return (mask&MASK_INTEGER)!=0;
452    case 4  : return (mask&MASK_INTEGER)!=0;
453    case 5  : return (mask&MASK_INTEGER)!=0;
454    case 6  : return (mask&MASK_INTEGER)!=0;
455    case 7  : return (mask&MASK_FLOAT)!=0;
456    case 8  : return (mask&MASK_INTEGER)!=0;
457    case 9  : return (mask&MASK_INTEGER)!=0;
458    case 10 : assert( !"RESERVED TYPE"); return 0;
459    case 11 : assert( !"RESERVED TYPE"); return 0;
460  }
461  return (mask&(SerialTypeIsBlob(iSerialType) ? MASK_BLOB : MASK_TEXT));
462}
463
464/* Versions of strdup() with return values appropriate for
465 * sqlite3_free().  malloc.c has sqlite3DbStrDup()/NDup(), but those
466 * need sqlite3DbFree(), which seems intrusive.
467 */
468static char *sqlite3_strndup(const char *z, unsigned n){
469  char *zNew;
470
471  if( z==NULL ){
472    return NULL;
473  }
474
475  zNew = sqlite3_malloc(n+1);
476  if( zNew!=NULL ){
477    memcpy(zNew, z, n);
478    zNew[n] = '\0';
479  }
480  return zNew;
481}
482static char *sqlite3_strdup(const char *z){
483  if( z==NULL ){
484    return NULL;
485  }
486  return sqlite3_strndup(z, strlen(z));
487}
488
489/* Fetch the page number of zTable in zDb from sqlite_master in zDb,
490 * and put it in *piRootPage.
491 */
492static int getRootPage(sqlite3 *db, const char *zDb, const char *zTable,
493                       u32 *piRootPage){
494  char *zSql;  /* SQL selecting root page of named element. */
495  sqlite3_stmt *pStmt;
496  int rc;
497
498  if( strcmp(zTable, "sqlite_master")==0 ){
499    *piRootPage = 1;
500    return SQLITE_OK;
501  }
502
503  zSql = sqlite3_mprintf("SELECT rootpage FROM %s.sqlite_master "
504                         "WHERE type = 'table' AND tbl_name = %Q",
505                         zDb, zTable);
506  if( !zSql ){
507    return SQLITE_NOMEM;
508  }
509
510  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
511  sqlite3_free(zSql);
512  if( rc!=SQLITE_OK ){
513    return rc;
514  }
515
516  /* Require a result. */
517  rc = sqlite3_step(pStmt);
518  if( rc==SQLITE_DONE ){
519    rc = SQLITE_CORRUPT;
520  }else if( rc==SQLITE_ROW ){
521    *piRootPage = sqlite3_column_int(pStmt, 0);
522
523    /* Require only one result. */
524    rc = sqlite3_step(pStmt);
525    if( rc==SQLITE_DONE ){
526      rc = SQLITE_OK;
527    }else if( rc==SQLITE_ROW ){
528      rc = SQLITE_CORRUPT;
529    }
530  }
531  sqlite3_finalize(pStmt);
532  return rc;
533}
534
535static int getEncoding(sqlite3 *db, const char *zDb, int* piEncoding){
536  sqlite3_stmt *pStmt;
537  int rc;
538  char *zSql = sqlite3_mprintf("PRAGMA %s.encoding", zDb);
539  if( !zSql ){
540    return SQLITE_NOMEM;
541  }
542
543  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
544  sqlite3_free(zSql);
545  if( rc!=SQLITE_OK ){
546    return rc;
547  }
548
549  /* Require a result. */
550  rc = sqlite3_step(pStmt);
551  if( rc==SQLITE_DONE ){
552    /* This case should not be possible. */
553    rc = SQLITE_CORRUPT;
554  }else if( rc==SQLITE_ROW ){
555    if( sqlite3_column_type(pStmt, 0)==SQLITE_TEXT ){
556      const char* z = (const char *)sqlite3_column_text(pStmt, 0);
557      /* These strings match the literals in pragma.c. */
558      if( !strcmp(z, "UTF-16le") ){
559        *piEncoding = SQLITE_UTF16LE;
560      }else if( !strcmp(z, "UTF-16be") ){
561        *piEncoding = SQLITE_UTF16BE;
562      }else if( !strcmp(z, "UTF-8") ){
563        *piEncoding = SQLITE_UTF8;
564      }else{
565        /* This case should not be possible. */
566        *piEncoding = SQLITE_UTF8;
567      }
568    }else{
569      /* This case should not be possible. */
570      *piEncoding = SQLITE_UTF8;
571    }
572
573    /* Require only one result. */
574    rc = sqlite3_step(pStmt);
575    if( rc==SQLITE_DONE ){
576      rc = SQLITE_OK;
577    }else if( rc==SQLITE_ROW ){
578      /* This case should not be possible. */
579      rc = SQLITE_CORRUPT;
580    }
581  }
582  sqlite3_finalize(pStmt);
583  return rc;
584}
585
586/* Cursor for iterating interior nodes.  Interior page cells contain a
587 * child page number and a rowid.  The child page contains items left
588 * of the rowid (less than).  The rightmost page of the subtree is
589 * stored in the page header.
590 *
591 * interiorCursorDestroy - release all resources associated with the
592 *                         cursor and any parent cursors.
593 * interiorCursorCreate - create a cursor with the given parent and page.
594 * interiorCursorEOF - returns true if neither the cursor nor the
595 *                     parent cursors can return any more data.
596 * interiorCursorNextPage - fetch the next child page from the cursor.
597 *
598 * Logically, interiorCursorNextPage() returns the next child page
599 * number from the page the cursor is currently reading, calling the
600 * parent cursor as necessary to get new pages to read, until done.
601 * SQLITE_ROW if a page is returned, SQLITE_DONE if out of pages,
602 * error otherwise.  Unfortunately, if the table is corrupted
603 * unexpected pages can be returned.  If any unexpected page is found,
604 * leaf or otherwise, it is returned to the caller for processing,
605 * with the interior cursor left empty.  The next call to
606 * interiorCursorNextPage() will recurse to the parent cursor until an
607 * interior page to iterate is returned.
608 *
609 * Note that while interiorCursorNextPage() will refuse to follow
610 * loops, it does not keep track of pages returned for purposes of
611 * preventing duplication.
612 *
613 * Note that interiorCursorEOF() could return false (not at EOF), and
614 * interiorCursorNextPage() could still return SQLITE_DONE.  This
615 * could happen if there are more cells to iterate in an interior
616 * page, but those cells refer to invalid pages.
617 */
618typedef struct RecoverInteriorCursor RecoverInteriorCursor;
619struct RecoverInteriorCursor {
620  RecoverInteriorCursor *pParent; /* Parent node to this node. */
621  DbPage *pPage;                  /* Reference to leaf page. */
622  unsigned nPageSize;             /* Size of page. */
623  unsigned nChildren;             /* Number of children on the page. */
624  unsigned iChild;                /* Index of next child to return. */
625};
626
627static void interiorCursorDestroy(RecoverInteriorCursor *pCursor){
628  /* Destroy all the cursors to the root. */
629  while( pCursor ){
630    RecoverInteriorCursor *p = pCursor;
631    pCursor = pCursor->pParent;
632
633    if( p->pPage ){
634      sqlite3PagerUnref(p->pPage);
635      p->pPage = NULL;
636    }
637
638    memset(p, 0xA5, sizeof(*p));
639    sqlite3_free(p);
640  }
641}
642
643/* Internal helper.  Reset storage in preparation for iterating pPage. */
644static void interiorCursorSetPage(RecoverInteriorCursor *pCursor,
645                                  DbPage *pPage){
646  assert( PageHeader(pPage)[kiPageTypeOffset]==kTableInteriorPage );
647
648  if( pCursor->pPage ){
649    sqlite3PagerUnref(pCursor->pPage);
650    pCursor->pPage = NULL;
651  }
652  pCursor->pPage = pPage;
653  pCursor->iChild = 0;
654
655  /* A child for each cell, plus one in the header. */
656  /* TODO(shess): Sanity-check the count?  Page header plus per-cell
657   * cost of 16-bit offset, 32-bit page number, and one varint
658   * (minimum 1 byte).
659   */
660  pCursor->nChildren = decodeUnsigned16(PageHeader(pPage) +
661                                        kiPageCellCountOffset) + 1;
662}
663
664static int interiorCursorCreate(RecoverInteriorCursor *pParent,
665                                DbPage *pPage, int nPageSize,
666                                RecoverInteriorCursor **ppCursor){
667  RecoverInteriorCursor *pCursor =
668    sqlite3_malloc(sizeof(RecoverInteriorCursor));
669  if( !pCursor ){
670    return SQLITE_NOMEM;
671  }
672
673  memset(pCursor, 0, sizeof(*pCursor));
674  pCursor->pParent = pParent;
675  pCursor->nPageSize = nPageSize;
676  interiorCursorSetPage(pCursor, pPage);
677  *ppCursor = pCursor;
678  return SQLITE_OK;
679}
680
681/* Internal helper.  Return the child page number at iChild. */
682static unsigned interiorCursorChildPage(RecoverInteriorCursor *pCursor){
683  const unsigned char *pPageHeader;  /* Header of the current page. */
684  const unsigned char *pCellOffsets; /* Offset to page's cell offsets. */
685  unsigned iCellOffset;              /* Offset of target cell. */
686
687  assert( pCursor->iChild<pCursor->nChildren );
688
689  /* Rightmost child is in the header. */
690  pPageHeader = PageHeader(pCursor->pPage);
691  if( pCursor->iChild==pCursor->nChildren-1 ){
692    return decodeUnsigned32(pPageHeader + kiPageRightChildOffset);
693  }
694
695  /* Each cell is a 4-byte integer page number and a varint rowid
696   * which is greater than the rowid of items in that sub-tree (this
697   * module ignores ordering). The offset is from the beginning of the
698   * page, not from the page header.
699   */
700  pCellOffsets = pPageHeader + kiPageInteriorHeaderBytes;
701  iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iChild*2);
702  if( iCellOffset<=pCursor->nPageSize-4 ){
703    return decodeUnsigned32(PageData(pCursor->pPage, iCellOffset));
704  }
705
706  /* TODO(shess): Check for cell overlaps?  Cells require 4 bytes plus
707   * a varint.  Check could be identical to leaf check (or even a
708   * shared helper testing for "Cells starting in this range"?).
709   */
710
711  /* If the offset is broken, return an invalid page number. */
712  return 0;
713}
714
715static int interiorCursorEOF(RecoverInteriorCursor *pCursor){
716  /* Find a parent with remaining children.  EOF if none found. */
717  while( pCursor && pCursor->iChild>=pCursor->nChildren ){
718    pCursor = pCursor->pParent;
719  }
720  return pCursor==NULL;
721}
722
723/* Internal helper.  Used to detect if iPage would cause a loop. */
724static int interiorCursorPageInUse(RecoverInteriorCursor *pCursor,
725                                   unsigned iPage){
726  /* Find any parent using the indicated page. */
727  while( pCursor && pCursor->pPage->pgno!=iPage ){
728    pCursor = pCursor->pParent;
729  }
730  return pCursor!=NULL;
731}
732
733/* Get the next page from the interior cursor at *ppCursor.  Returns
734 * SQLITE_ROW with the page in *ppPage, or SQLITE_DONE if out of
735 * pages, or the error SQLite returned.
736 *
737 * If the tree is uneven, then when the cursor attempts to get a new
738 * interior page from the parent cursor, it may get a non-interior
739 * page.  In that case, the new page is returned, and *ppCursor is
740 * updated to point to the parent cursor (this cursor is freed).
741 */
742/* TODO(shess): I've tried to avoid recursion in most of this code,
743 * but this case is more challenging because the recursive call is in
744 * the middle of operation.  One option for converting it without
745 * adding memory management would be to retain the head pointer and
746 * use a helper to "back up" as needed.  Another option would be to
747 * reverse the list during traversal.
748 */
749static int interiorCursorNextPage(RecoverInteriorCursor **ppCursor,
750                                  DbPage **ppPage){
751  RecoverInteriorCursor *pCursor = *ppCursor;
752  while( 1 ){
753    int rc;
754    const unsigned char *pPageHeader;  /* Header of found page. */
755
756    /* Find a valid child page which isn't on the stack. */
757    while( pCursor->iChild<pCursor->nChildren ){
758      const unsigned iPage = interiorCursorChildPage(pCursor);
759      pCursor->iChild++;
760      if( interiorCursorPageInUse(pCursor, iPage) ){
761        fprintf(stderr, "Loop detected at %d\n", iPage);
762      }else{
763        int rc = sqlite3PagerAcquire(pCursor->pPage->pPager, iPage, ppPage, 0);
764        if( rc==SQLITE_OK ){
765          return SQLITE_ROW;
766        }
767      }
768    }
769
770    /* This page has no more children.  Get next page from parent. */
771    if( !pCursor->pParent ){
772      return SQLITE_DONE;
773    }
774    rc = interiorCursorNextPage(&pCursor->pParent, ppPage);
775    if( rc!=SQLITE_ROW ){
776      return rc;
777    }
778
779    /* If a non-interior page is received, that either means that the
780     * tree is uneven, or that a child was re-used (say as an overflow
781     * page).  Remove this cursor and let the caller handle the page.
782     */
783    pPageHeader = PageHeader(*ppPage);
784    if( pPageHeader[kiPageTypeOffset]!=kTableInteriorPage ){
785      *ppCursor = pCursor->pParent;
786      pCursor->pParent = NULL;
787      interiorCursorDestroy(pCursor);
788      return SQLITE_ROW;
789    }
790
791    /* Iterate the new page. */
792    interiorCursorSetPage(pCursor, *ppPage);
793    *ppPage = NULL;
794  }
795
796  assert(NULL);  /* NOTREACHED() */
797  return SQLITE_CORRUPT;
798}
799
800/* Large rows are spilled to overflow pages.  The row's main page
801 * stores the overflow page number after the local payload, with a
802 * linked list forward from there as necessary.  overflowMaybeCreate()
803 * and overflowGetSegment() provide an abstraction for accessing such
804 * data while centralizing the code.
805 *
806 * overflowDestroy - releases all resources associated with the structure.
807 * overflowMaybeCreate - create the overflow structure if it is needed
808 *                       to represent the given record.  See function comment.
809 * overflowGetSegment - fetch a segment from the record, accounting
810 *                      for overflow pages.  Segments which are not
811 *                      entirely contained with a page are constructed
812 *                      into a buffer which is returned.  See function comment.
813 */
814typedef struct RecoverOverflow RecoverOverflow;
815struct RecoverOverflow {
816  RecoverOverflow *pNextOverflow;
817  DbPage *pPage;
818  unsigned nPageSize;
819};
820
821static void overflowDestroy(RecoverOverflow *pOverflow){
822  while( pOverflow ){
823    RecoverOverflow *p = pOverflow;
824    pOverflow = p->pNextOverflow;
825
826    if( p->pPage ){
827      sqlite3PagerUnref(p->pPage);
828      p->pPage = NULL;
829    }
830
831    memset(p, 0xA5, sizeof(*p));
832    sqlite3_free(p);
833  }
834}
835
836/* Internal helper.  Used to detect if iPage would cause a loop. */
837static int overflowPageInUse(RecoverOverflow *pOverflow, unsigned iPage){
838  while( pOverflow && pOverflow->pPage->pgno!=iPage ){
839    pOverflow = pOverflow->pNextOverflow;
840  }
841  return pOverflow!=NULL;
842}
843
844/* Setup to access an nRecordBytes record beginning at iRecordOffset
845 * in pPage.  If nRecordBytes can be satisfied entirely from pPage,
846 * then no overflow pages are needed an *pnLocalRecordBytes is set to
847 * nRecordBytes.  Otherwise, *ppOverflow is set to the head of a list
848 * of overflow pages, and *pnLocalRecordBytes is set to the number of
849 * bytes local to pPage.
850 *
851 * overflowGetSegment() will do the right thing regardless of whether
852 * those values are set to be in-page or not.
853 */
854static int overflowMaybeCreate(DbPage *pPage, unsigned nPageSize,
855                               unsigned iRecordOffset, unsigned nRecordBytes,
856                               unsigned *pnLocalRecordBytes,
857                               RecoverOverflow **ppOverflow){
858  unsigned nLocalRecordBytes;  /* Record bytes in the leaf page. */
859  unsigned iNextPage;          /* Next page number for record data. */
860  unsigned nBytes;             /* Maximum record bytes as of current page. */
861  int rc;
862  RecoverOverflow *pFirstOverflow;  /* First in linked list of pages. */
863  RecoverOverflow *pLastOverflow;   /* End of linked list. */
864
865  /* Calculations from the "Table B-Tree Leaf Cell" part of section
866   * 1.5 of http://www.sqlite.org/fileformat2.html .  maxLocal and
867   * minLocal to match naming in btree.c.
868   */
869  const unsigned maxLocal = nPageSize - 35;
870  const unsigned minLocal = ((nPageSize-12)*32/255)-23;  /* m */
871
872  /* Always fit anything smaller than maxLocal. */
873  if( nRecordBytes<=maxLocal ){
874    *pnLocalRecordBytes = nRecordBytes;
875    *ppOverflow = NULL;
876    return SQLITE_OK;
877  }
878
879  /* Calculate the remainder after accounting for minLocal on the leaf
880   * page and what packs evenly into overflow pages.  If the remainder
881   * does not fit into maxLocal, then a partially-full overflow page
882   * will be required in any case, so store as little as possible locally.
883   */
884  nLocalRecordBytes = minLocal+((nRecordBytes-minLocal)%(nPageSize-4));
885  if( maxLocal<nLocalRecordBytes ){
886    nLocalRecordBytes = minLocal;
887  }
888
889  /* Don't read off the end of the page. */
890  if( iRecordOffset+nLocalRecordBytes+4>nPageSize ){
891    return SQLITE_CORRUPT;
892  }
893
894  /* First overflow page number is after the local bytes. */
895  iNextPage =
896      decodeUnsigned32(PageData(pPage, iRecordOffset + nLocalRecordBytes));
897  nBytes = nLocalRecordBytes;
898
899  /* While there are more pages to read, and more bytes are needed,
900   * get another page.
901   */
902  pFirstOverflow = pLastOverflow = NULL;
903  rc = SQLITE_OK;
904  while( iNextPage && nBytes<nRecordBytes ){
905    RecoverOverflow *pOverflow;  /* New overflow page for the list. */
906
907    rc = sqlite3PagerAcquire(pPage->pPager, iNextPage, &pPage, 0);
908    if( rc!=SQLITE_OK ){
909      break;
910    }
911
912    pOverflow = sqlite3_malloc(sizeof(RecoverOverflow));
913    if( !pOverflow ){
914      sqlite3PagerUnref(pPage);
915      rc = SQLITE_NOMEM;
916      break;
917    }
918    memset(pOverflow, 0, sizeof(*pOverflow));
919    pOverflow->pPage = pPage;
920    pOverflow->nPageSize = nPageSize;
921
922    if( !pFirstOverflow ){
923      pFirstOverflow = pOverflow;
924    }else{
925      pLastOverflow->pNextOverflow = pOverflow;
926    }
927    pLastOverflow = pOverflow;
928
929    iNextPage = decodeUnsigned32(pPage->pData);
930    nBytes += nPageSize-4;
931
932    /* Avoid loops. */
933    if( overflowPageInUse(pFirstOverflow, iNextPage) ){
934      fprintf(stderr, "Overflow loop detected at %d\n", iNextPage);
935      rc = SQLITE_CORRUPT;
936      break;
937    }
938  }
939
940  /* If there were not enough pages, or too many, things are corrupt.
941   * Not having enough pages is an obvious problem, all the data
942   * cannot be read.  Too many pages means that the contents of the
943   * row between the main page and the overflow page(s) is
944   * inconsistent (most likely one or more of the overflow pages does
945   * not really belong to this row).
946   */
947  if( rc==SQLITE_OK && (nBytes<nRecordBytes || iNextPage) ){
948    rc = SQLITE_CORRUPT;
949  }
950
951  if( rc==SQLITE_OK ){
952    *ppOverflow = pFirstOverflow;
953    *pnLocalRecordBytes = nLocalRecordBytes;
954  }else if( pFirstOverflow ){
955    overflowDestroy(pFirstOverflow);
956  }
957  return rc;
958}
959
960/* Use in concert with overflowMaybeCreate() to efficiently read parts
961 * of a potentially-overflowing record.  pPage and iRecordOffset are
962 * the values passed into overflowMaybeCreate(), nLocalRecordBytes and
963 * pOverflow are the values returned by that call.
964 *
965 * On SQLITE_OK, *ppBase points to nRequestBytes of data at
966 * iRequestOffset within the record.  If the data exists contiguously
967 * in a page, a direct pointer is returned, otherwise a buffer from
968 * sqlite3_malloc() is returned with the data.  *pbFree is set true if
969 * sqlite3_free() should be called on *ppBase.
970 */
971/* Operation of this function is subtle.  At any time, pPage is the
972 * current page, with iRecordOffset and nLocalRecordBytes being record
973 * data within pPage, and pOverflow being the overflow page after
974 * pPage.  This allows the code to handle both the initial leaf page
975 * and overflow pages consistently by adjusting the values
976 * appropriately.
977 */
978static int overflowGetSegment(DbPage *pPage, unsigned iRecordOffset,
979                              unsigned nLocalRecordBytes,
980                              RecoverOverflow *pOverflow,
981                              unsigned iRequestOffset, unsigned nRequestBytes,
982                              unsigned char **ppBase, int *pbFree){
983  unsigned nBase;         /* Amount of data currently collected. */
984  unsigned char *pBase;   /* Buffer to collect record data into. */
985
986  /* Skip to the page containing the start of the data. */
987  while( iRequestOffset>=nLocalRecordBytes && pOverflow ){
988    /* Factor out current page's contribution. */
989    iRequestOffset -= nLocalRecordBytes;
990
991    /* Move forward to the next page in the list. */
992    pPage = pOverflow->pPage;
993    iRecordOffset = 4;
994    nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
995    pOverflow = pOverflow->pNextOverflow;
996  }
997
998  /* If the requested data is entirely within this page, return a
999   * pointer into the page.
1000   */
1001  if( iRequestOffset+nRequestBytes<=nLocalRecordBytes ){
1002    /* TODO(shess): "assignment discards qualifiers from pointer target type"
1003     * Having ppBase be const makes sense, but sqlite3_free() takes non-const.
1004     */
1005    *ppBase = (unsigned char *)PageData(pPage, iRecordOffset + iRequestOffset);
1006    *pbFree = 0;
1007    return SQLITE_OK;
1008  }
1009
1010  /* The data range would require additional pages. */
1011  if( !pOverflow ){
1012    /* Should never happen, the range is outside the nRecordBytes
1013     * passed to overflowMaybeCreate().
1014     */
1015    assert(NULL);  /* NOTREACHED */
1016    return SQLITE_ERROR;
1017  }
1018
1019  /* Get a buffer to construct into. */
1020  nBase = 0;
1021  pBase = sqlite3_malloc(nRequestBytes);
1022  if( !pBase ){
1023    return SQLITE_NOMEM;
1024  }
1025  while( nBase<nRequestBytes ){
1026    /* Copy over data present on this page. */
1027    unsigned nCopyBytes = nRequestBytes - nBase;
1028    if( nLocalRecordBytes-iRequestOffset<nCopyBytes ){
1029      nCopyBytes = nLocalRecordBytes - iRequestOffset;
1030    }
1031    memcpy(pBase + nBase, PageData(pPage, iRecordOffset + iRequestOffset),
1032           nCopyBytes);
1033    nBase += nCopyBytes;
1034
1035    if( pOverflow ){
1036      /* Copy from start of record data in future pages. */
1037      iRequestOffset = 0;
1038
1039      /* Move forward to the next page in the list.  Should match
1040       * first while() loop.
1041       */
1042      pPage = pOverflow->pPage;
1043      iRecordOffset = 4;
1044      nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
1045      pOverflow = pOverflow->pNextOverflow;
1046    }else if( nBase<nRequestBytes ){
1047      /* Ran out of overflow pages with data left to deliver.  Not
1048       * possible if the requested range fits within nRecordBytes
1049       * passed to overflowMaybeCreate() when creating pOverflow.
1050       */
1051      assert(NULL);  /* NOTREACHED */
1052      sqlite3_free(pBase);
1053      return SQLITE_ERROR;
1054    }
1055  }
1056  assert( nBase==nRequestBytes );
1057  *ppBase = pBase;
1058  *pbFree = 1;
1059  return SQLITE_OK;
1060}
1061
1062/* Primary structure for iterating the contents of a table.
1063 *
1064 * leafCursorDestroy - release all resources associated with the cursor.
1065 * leafCursorCreate - create a cursor to iterate items from tree at
1066 *                    the provided root page.
1067 * leafCursorNextValidCell - get the cursor ready to access data from
1068 *                           the next valid cell in the table.
1069 * leafCursorCellRowid - get the current cell's rowid.
1070 * leafCursorCellColumns - get current cell's column count.
1071 * leafCursorCellColInfo - get type and data for a column in current cell.
1072 *
1073 * leafCursorNextValidCell skips cells which fail simple integrity
1074 * checks, such as overlapping other cells, or being located at
1075 * impossible offsets, or where header data doesn't correctly describe
1076 * payload data.  Returns SQLITE_ROW if a valid cell is found,
1077 * SQLITE_DONE if all pages in the tree were exhausted.
1078 *
1079 * leafCursorCellColInfo() accounts for overflow pages in the style of
1080 * overflowGetSegment().
1081 */
1082typedef struct RecoverLeafCursor RecoverLeafCursor;
1083struct RecoverLeafCursor {
1084  RecoverInteriorCursor *pParent;  /* Parent node to this node. */
1085  DbPage *pPage;                   /* Reference to leaf page. */
1086  unsigned nPageSize;              /* Size of pPage. */
1087  unsigned nCells;                 /* Number of cells in pPage. */
1088  unsigned iCell;                  /* Current cell. */
1089
1090  /* Info parsed from data in iCell. */
1091  i64 iRowid;                      /* rowid parsed. */
1092  unsigned nRecordCols;            /* how many items in the record. */
1093  u64 iRecordOffset;               /* offset to record data. */
1094  /* TODO(shess): nRecordBytes and nRecordHeaderBytes are used in
1095   * leafCursorCellColInfo() to prevent buffer overruns.
1096   * leafCursorCellDecode() already verified that the cell is valid, so
1097   * those checks should be redundant.
1098   */
1099  u64 nRecordBytes;                /* Size of record data. */
1100  unsigned nLocalRecordBytes;      /* Amount of record data in-page. */
1101  unsigned nRecordHeaderBytes;     /* Size of record header data. */
1102  unsigned char *pRecordHeader;    /* Pointer to record header data. */
1103  int bFreeRecordHeader;           /* True if record header requires free. */
1104  RecoverOverflow *pOverflow;      /* Cell overflow info, if needed. */
1105};
1106
1107/* Internal helper shared between next-page and create-cursor.  If
1108 * pPage is a leaf page, it will be stored in the cursor and state
1109 * initialized for reading cells.
1110 *
1111 * If pPage is an interior page, a new parent cursor is created and
1112 * injected on the stack.  This is necessary to handle trees with
1113 * uneven depth, but also is used during initial setup.
1114 *
1115 * If pPage is not a table page at all, it is discarded.
1116 *
1117 * If SQLITE_OK is returned, the caller no longer owns pPage,
1118 * otherwise the caller is responsible for discarding it.
1119 */
1120static int leafCursorLoadPage(RecoverLeafCursor *pCursor, DbPage *pPage){
1121  const unsigned char *pPageHeader;  /* Header of *pPage */
1122
1123  /* Release the current page. */
1124  if( pCursor->pPage ){
1125    sqlite3PagerUnref(pCursor->pPage);
1126    pCursor->pPage = NULL;
1127    pCursor->iCell = pCursor->nCells = 0;
1128  }
1129
1130  /* If the page is an unexpected interior node, inject a new stack
1131   * layer and try again from there.
1132   */
1133  pPageHeader = PageHeader(pPage);
1134  if( pPageHeader[kiPageTypeOffset]==kTableInteriorPage ){
1135    RecoverInteriorCursor *pParent;
1136    int rc = interiorCursorCreate(pCursor->pParent, pPage, pCursor->nPageSize,
1137                                  &pParent);
1138    if( rc!=SQLITE_OK ){
1139      return rc;
1140    }
1141    pCursor->pParent = pParent;
1142    return SQLITE_OK;
1143  }
1144
1145  /* Not a leaf page, skip it. */
1146  if( pPageHeader[kiPageTypeOffset]!=kTableLeafPage ){
1147    sqlite3PagerUnref(pPage);
1148    return SQLITE_OK;
1149  }
1150
1151  /* Take ownership of the page and start decoding. */
1152  pCursor->pPage = pPage;
1153  pCursor->iCell = 0;
1154  pCursor->nCells = decodeUnsigned16(pPageHeader + kiPageCellCountOffset);
1155  return SQLITE_OK;
1156}
1157
1158/* Get the next leaf-level page in the tree.  Returns SQLITE_ROW when
1159 * a leaf page is found, SQLITE_DONE when no more leaves exist, or any
1160 * error which occurred.
1161 */
1162static int leafCursorNextPage(RecoverLeafCursor *pCursor){
1163  if( !pCursor->pParent ){
1164    return SQLITE_DONE;
1165  }
1166
1167  /* Repeatedly load the parent's next child page until a leaf is found. */
1168  do {
1169    DbPage *pNextPage;
1170    int rc = interiorCursorNextPage(&pCursor->pParent, &pNextPage);
1171    if( rc!=SQLITE_ROW ){
1172      assert( rc==SQLITE_DONE );
1173      return rc;
1174    }
1175
1176    rc = leafCursorLoadPage(pCursor, pNextPage);
1177    if( rc!=SQLITE_OK ){
1178      sqlite3PagerUnref(pNextPage);
1179      return rc;
1180    }
1181  } while( !pCursor->pPage );
1182
1183  return SQLITE_ROW;
1184}
1185
1186static void leafCursorDestroyCellData(RecoverLeafCursor *pCursor){
1187  if( pCursor->bFreeRecordHeader ){
1188    sqlite3_free(pCursor->pRecordHeader);
1189  }
1190  pCursor->bFreeRecordHeader = 0;
1191  pCursor->pRecordHeader = NULL;
1192
1193  if( pCursor->pOverflow ){
1194    overflowDestroy(pCursor->pOverflow);
1195    pCursor->pOverflow = NULL;
1196  }
1197}
1198
1199static void leafCursorDestroy(RecoverLeafCursor *pCursor){
1200  leafCursorDestroyCellData(pCursor);
1201
1202  if( pCursor->pParent ){
1203    interiorCursorDestroy(pCursor->pParent);
1204    pCursor->pParent = NULL;
1205  }
1206
1207  if( pCursor->pPage ){
1208    sqlite3PagerUnref(pCursor->pPage);
1209    pCursor->pPage = NULL;
1210  }
1211
1212  memset(pCursor, 0xA5, sizeof(*pCursor));
1213  sqlite3_free(pCursor);
1214}
1215
1216/* Create a cursor to iterate the rows from the leaf pages of a table
1217 * rooted at iRootPage.
1218 */
1219/* TODO(shess): recoverOpen() calls this to setup the cursor, and I
1220 * think that recoverFilter() may make a hard assumption that the
1221 * cursor returned will turn up at least one valid cell.
1222 *
1223 * The cases I can think of which break this assumption are:
1224 * - pPage is a valid leaf page with no valid cells.
1225 * - pPage is a valid interior page with no valid leaves.
1226 * - pPage is a valid interior page who's leaves contain no valid cells.
1227 * - pPage is not a valid leaf or interior page.
1228 */
1229static int leafCursorCreate(Pager *pPager, unsigned nPageSize,
1230                            u32 iRootPage, RecoverLeafCursor **ppCursor){
1231  DbPage *pPage;               /* Reference to page at iRootPage. */
1232  RecoverLeafCursor *pCursor;  /* Leaf cursor being constructed. */
1233  int rc;
1234
1235  /* Start out with the root page. */
1236  rc = sqlite3PagerAcquire(pPager, iRootPage, &pPage, 0);
1237  if( rc!=SQLITE_OK ){
1238    return rc;
1239  }
1240
1241  pCursor = sqlite3_malloc(sizeof(RecoverLeafCursor));
1242  if( !pCursor ){
1243    sqlite3PagerUnref(pPage);
1244    return SQLITE_NOMEM;
1245  }
1246  memset(pCursor, 0, sizeof(*pCursor));
1247
1248  pCursor->nPageSize = nPageSize;
1249
1250  rc = leafCursorLoadPage(pCursor, pPage);
1251  if( rc!=SQLITE_OK ){
1252    sqlite3PagerUnref(pPage);
1253    leafCursorDestroy(pCursor);
1254    return rc;
1255  }
1256
1257  /* pPage wasn't a leaf page, find the next leaf page. */
1258  if( !pCursor->pPage ){
1259    rc = leafCursorNextPage(pCursor);
1260    if( rc!=SQLITE_DONE && rc!=SQLITE_ROW ){
1261      leafCursorDestroy(pCursor);
1262      return rc;
1263    }
1264  }
1265
1266  *ppCursor = pCursor;
1267  return SQLITE_OK;
1268}
1269
1270/* Useful for setting breakpoints. */
1271static int ValidateError(){
1272  return SQLITE_ERROR;
1273}
1274
1275/* Setup the cursor for reading the information from cell iCell. */
1276static int leafCursorCellDecode(RecoverLeafCursor *pCursor){
1277  const unsigned char *pPageHeader;  /* Header of current page. */
1278  const unsigned char *pPageEnd;     /* Byte after end of current page. */
1279  const unsigned char *pCellOffsets; /* Pointer to page's cell offsets. */
1280  unsigned iCellOffset;              /* Offset of current cell (iCell). */
1281  const unsigned char *pCell;        /* Pointer to data at iCellOffset. */
1282  unsigned nCellMaxBytes;            /* Maximum local size of iCell. */
1283  unsigned iEndOffset;               /* End of iCell's in-page data. */
1284  u64 nRecordBytes;                  /* Expected size of cell, w/overflow. */
1285  u64 iRowid;                        /* iCell's rowid (in table). */
1286  unsigned nRead;                    /* Amount of cell read. */
1287  unsigned nRecordHeaderRead;        /* Header data read. */
1288  u64 nRecordHeaderBytes;            /* Header size expected. */
1289  unsigned nRecordCols;              /* Columns read from header. */
1290  u64 nRecordColBytes;               /* Bytes in payload for those columns. */
1291  unsigned i;
1292  int rc;
1293
1294  assert( pCursor->iCell<pCursor->nCells );
1295
1296  leafCursorDestroyCellData(pCursor);
1297
1298  /* Find the offset to the row. */
1299  pPageHeader = PageHeader(pCursor->pPage);
1300  pCellOffsets = pPageHeader + knPageLeafHeaderBytes;
1301  pPageEnd = PageData(pCursor->pPage, pCursor->nPageSize);
1302  if( pCellOffsets + pCursor->iCell*2 + 2 > pPageEnd ){
1303    return ValidateError();
1304  }
1305  iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iCell*2);
1306  if( iCellOffset>=pCursor->nPageSize ){
1307    return ValidateError();
1308  }
1309
1310  pCell = PageData(pCursor->pPage, iCellOffset);
1311  nCellMaxBytes = pCursor->nPageSize - iCellOffset;
1312
1313  /* B-tree leaf cells lead with varint record size, varint rowid and
1314   * varint header size.
1315   */
1316  /* TODO(shess): The smallest page size is 512 bytes, which has an m
1317   * of 39.  Three varints need at most 27 bytes to encode.  I think.
1318   */
1319  if( !checkVarints(pCell, nCellMaxBytes, 3) ){
1320    return ValidateError();
1321  }
1322
1323  nRead = getVarint(pCell, &nRecordBytes);
1324  assert( iCellOffset+nRead<=pCursor->nPageSize );
1325  pCursor->nRecordBytes = nRecordBytes;
1326
1327  nRead += getVarint(pCell + nRead, &iRowid);
1328  assert( iCellOffset+nRead<=pCursor->nPageSize );
1329  pCursor->iRowid = (i64)iRowid;
1330
1331  pCursor->iRecordOffset = iCellOffset + nRead;
1332
1333  /* Start overflow setup here because nLocalRecordBytes is needed to
1334   * check cell overlap.
1335   */
1336  rc = overflowMaybeCreate(pCursor->pPage, pCursor->nPageSize,
1337                           pCursor->iRecordOffset, pCursor->nRecordBytes,
1338                           &pCursor->nLocalRecordBytes,
1339                           &pCursor->pOverflow);
1340  if( rc!=SQLITE_OK ){
1341    return ValidateError();
1342  }
1343
1344  /* Check that no other cell starts within this cell. */
1345  iEndOffset = pCursor->iRecordOffset + pCursor->nLocalRecordBytes;
1346  for( i=0; i<pCursor->nCells && pCellOffsets + i*2 + 2 <= pPageEnd; ++i ){
1347    const unsigned iOtherOffset = decodeUnsigned16(pCellOffsets + i*2);
1348    if( iOtherOffset>iCellOffset && iOtherOffset<iEndOffset ){
1349      return ValidateError();
1350    }
1351  }
1352
1353  nRecordHeaderRead = getVarint(pCell + nRead, &nRecordHeaderBytes);
1354  assert( nRecordHeaderBytes<=nRecordBytes );
1355  pCursor->nRecordHeaderBytes = nRecordHeaderBytes;
1356
1357  /* Large headers could overflow if pages are small. */
1358  rc = overflowGetSegment(pCursor->pPage,
1359                          pCursor->iRecordOffset, pCursor->nLocalRecordBytes,
1360                          pCursor->pOverflow, 0, nRecordHeaderBytes,
1361                          &pCursor->pRecordHeader, &pCursor->bFreeRecordHeader);
1362  if( rc!=SQLITE_OK ){
1363    return ValidateError();
1364  }
1365
1366  /* Tally up the column count and size of data. */
1367  nRecordCols = 0;
1368  nRecordColBytes = 0;
1369  while( nRecordHeaderRead<nRecordHeaderBytes ){
1370    u64 iSerialType;  /* Type descriptor for current column. */
1371    if( !checkVarint(pCursor->pRecordHeader + nRecordHeaderRead,
1372                     nRecordHeaderBytes - nRecordHeaderRead) ){
1373      return ValidateError();
1374    }
1375    nRecordHeaderRead += getVarint(pCursor->pRecordHeader + nRecordHeaderRead,
1376                                   &iSerialType);
1377    if( iSerialType==10 || iSerialType==11 ){
1378      return ValidateError();
1379    }
1380    nRecordColBytes += SerialTypeLength(iSerialType);
1381    nRecordCols++;
1382  }
1383  pCursor->nRecordCols = nRecordCols;
1384
1385  /* Parsing the header used as many bytes as expected. */
1386  if( nRecordHeaderRead!=nRecordHeaderBytes ){
1387    return ValidateError();
1388  }
1389
1390  /* Calculated record is size of expected record. */
1391  if( nRecordHeaderBytes+nRecordColBytes!=nRecordBytes ){
1392    return ValidateError();
1393  }
1394
1395  return SQLITE_OK;
1396}
1397
1398static i64 leafCursorCellRowid(RecoverLeafCursor *pCursor){
1399  return pCursor->iRowid;
1400}
1401
1402static unsigned leafCursorCellColumns(RecoverLeafCursor *pCursor){
1403  return pCursor->nRecordCols;
1404}
1405
1406/* Get the column info for the cell.  Pass NULL for ppBase to prevent
1407 * retrieving the data segment.  If *pbFree is true, *ppBase must be
1408 * freed by the caller using sqlite3_free().
1409 */
1410static int leafCursorCellColInfo(RecoverLeafCursor *pCursor,
1411                                 unsigned iCol, u64 *piColType,
1412                                 unsigned char **ppBase, int *pbFree){
1413  const unsigned char *pRecordHeader;  /* Current cell's header. */
1414  u64 nRecordHeaderBytes;              /* Bytes in pRecordHeader. */
1415  unsigned nRead;                      /* Bytes read from header. */
1416  u64 iColEndOffset;                   /* Offset to end of column in cell. */
1417  unsigned nColsSkipped;               /* Count columns as procesed. */
1418  u64 iSerialType;                     /* Type descriptor for current column. */
1419
1420  /* Implicit NULL for columns past the end.  This case happens when
1421   * rows have not been updated since an ALTER TABLE added columns.
1422   * It is more convenient to address here than in callers.
1423   */
1424  if( iCol>=pCursor->nRecordCols ){
1425    *piColType = 0;
1426    if( ppBase ){
1427      *ppBase = 0;
1428      *pbFree = 0;
1429    }
1430    return SQLITE_OK;
1431  }
1432
1433  /* Must be able to decode header size. */
1434  pRecordHeader = pCursor->pRecordHeader;
1435  if( !checkVarint(pRecordHeader, pCursor->nRecordHeaderBytes) ){
1436    return SQLITE_CORRUPT;
1437  }
1438
1439  /* Rather than caching the header size and how many bytes it took,
1440   * decode it every time.
1441   */
1442  nRead = getVarint(pRecordHeader, &nRecordHeaderBytes);
1443  assert( nRecordHeaderBytes==pCursor->nRecordHeaderBytes );
1444
1445  /* Scan forward to the indicated column.  Scans to _after_ column
1446   * for later range checking.
1447   */
1448  /* TODO(shess): This could get expensive for very wide tables.  An
1449   * array of iSerialType could be built in leafCursorCellDecode(), but
1450   * the number of columns is dynamic per row, so it would add memory
1451   * management complexity.  Enough info to efficiently forward
1452   * iterate could be kept, if all clients forward iterate
1453   * (recoverColumn() may not).
1454   */
1455  iColEndOffset = 0;
1456  nColsSkipped = 0;
1457  while( nColsSkipped<=iCol && nRead<nRecordHeaderBytes ){
1458    if( !checkVarint(pRecordHeader + nRead, nRecordHeaderBytes - nRead) ){
1459      return SQLITE_CORRUPT;
1460    }
1461    nRead += getVarint(pRecordHeader + nRead, &iSerialType);
1462    iColEndOffset += SerialTypeLength(iSerialType);
1463    nColsSkipped++;
1464  }
1465
1466  /* Column's data extends past record's end. */
1467  if( nRecordHeaderBytes+iColEndOffset>pCursor->nRecordBytes ){
1468    return SQLITE_CORRUPT;
1469  }
1470
1471  *piColType = iSerialType;
1472  if( ppBase ){
1473    const u32 nColBytes = SerialTypeLength(iSerialType);
1474
1475    /* Offset from start of record to beginning of column. */
1476    const unsigned iColOffset = nRecordHeaderBytes+iColEndOffset-nColBytes;
1477
1478    return overflowGetSegment(pCursor->pPage, pCursor->iRecordOffset,
1479                              pCursor->nLocalRecordBytes, pCursor->pOverflow,
1480                              iColOffset, nColBytes, ppBase, pbFree);
1481  }
1482  return SQLITE_OK;
1483}
1484
1485static int leafCursorNextValidCell(RecoverLeafCursor *pCursor){
1486  while( 1 ){
1487    int rc;
1488
1489    /* Move to the next cell. */
1490    pCursor->iCell++;
1491
1492    /* No more cells, get the next leaf. */
1493    if( pCursor->iCell>=pCursor->nCells ){
1494      rc = leafCursorNextPage(pCursor);
1495      if( rc!=SQLITE_ROW ){
1496        return rc;
1497      }
1498      assert( pCursor->iCell==0 );
1499    }
1500
1501    /* If the cell is valid, indicate that a row is available. */
1502    rc = leafCursorCellDecode(pCursor);
1503    if( rc==SQLITE_OK ){
1504      return SQLITE_ROW;
1505    }
1506
1507    /* Iterate until done or a valid row is found. */
1508    /* TODO(shess): Remove debugging output. */
1509    fprintf(stderr, "Skipping invalid cell\n");
1510  }
1511  return SQLITE_ERROR;
1512}
1513
1514typedef struct Recover Recover;
1515struct Recover {
1516  sqlite3_vtab base;
1517  sqlite3 *db;                /* Host database connection */
1518  char *zDb;                  /* Database containing target table */
1519  char *zTable;               /* Target table */
1520  unsigned nCols;             /* Number of columns in target table */
1521  unsigned char *pTypes;      /* Types of columns in target table */
1522};
1523
1524/* Internal helper for deleting the module. */
1525static void recoverRelease(Recover *pRecover){
1526  sqlite3_free(pRecover->zDb);
1527  sqlite3_free(pRecover->zTable);
1528  sqlite3_free(pRecover->pTypes);
1529  memset(pRecover, 0xA5, sizeof(*pRecover));
1530  sqlite3_free(pRecover);
1531}
1532
1533/* Helper function for initializing the module.  Forward-declared so
1534 * recoverCreate() and recoverConnect() can see it.
1535 */
1536static int recoverInit(
1537  sqlite3 *, void *, int, const char *const*, sqlite3_vtab **, char **
1538);
1539
1540static int recoverCreate(
1541  sqlite3 *db,
1542  void *pAux,
1543  int argc, const char *const*argv,
1544  sqlite3_vtab **ppVtab,
1545  char **pzErr
1546){
1547  FNENTRY();
1548  return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
1549}
1550
1551/* This should never be called. */
1552static int recoverConnect(
1553  sqlite3 *db,
1554  void *pAux,
1555  int argc, const char *const*argv,
1556  sqlite3_vtab **ppVtab,
1557  char **pzErr
1558){
1559  FNENTRY();
1560  return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
1561}
1562
1563/* No indices supported. */
1564static int recoverBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
1565  FNENTRY();
1566  return SQLITE_OK;
1567}
1568
1569/* Logically, this should never be called. */
1570static int recoverDisconnect(sqlite3_vtab *pVtab){
1571  FNENTRY();
1572  recoverRelease((Recover*)pVtab);
1573  return SQLITE_OK;
1574}
1575
1576static int recoverDestroy(sqlite3_vtab *pVtab){
1577  FNENTRY();
1578  recoverRelease((Recover*)pVtab);
1579  return SQLITE_OK;
1580}
1581
1582typedef struct RecoverCursor RecoverCursor;
1583struct RecoverCursor {
1584  sqlite3_vtab_cursor base;
1585  RecoverLeafCursor *pLeafCursor;
1586  int iEncoding;
1587  int bEOF;
1588};
1589
1590static int recoverOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
1591  Recover *pRecover = (Recover*)pVTab;
1592  u32 iRootPage;                   /* Root page of the backing table. */
1593  int iEncoding;                   /* UTF encoding for backing database. */
1594  unsigned nPageSize;              /* Size of pages in backing database. */
1595  Pager *pPager;                   /* Backing database pager. */
1596  RecoverLeafCursor *pLeafCursor;  /* Cursor to read table's leaf pages. */
1597  RecoverCursor *pCursor;          /* Cursor to read rows from leaves. */
1598  int rc;
1599
1600  FNENTRY();
1601
1602  iRootPage = 0;
1603  rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable,
1604                   &iRootPage);
1605  if( rc!=SQLITE_OK ){
1606    return rc;
1607  }
1608
1609  iEncoding = 0;
1610  rc = getEncoding(pRecover->db, pRecover->zDb, &iEncoding);
1611  if( rc!=SQLITE_OK ){
1612    return rc;
1613  }
1614
1615  rc = GetPager(pRecover->db, pRecover->zDb, &pPager, &nPageSize);
1616  if( rc!=SQLITE_OK ){
1617    return rc;
1618  }
1619
1620  rc = leafCursorCreate(pPager, nPageSize, iRootPage, &pLeafCursor);
1621  if( rc!=SQLITE_OK ){
1622    return rc;
1623  }
1624
1625  pCursor = sqlite3_malloc(sizeof(RecoverCursor));
1626  if( !pCursor ){
1627    leafCursorDestroy(pLeafCursor);
1628    return SQLITE_NOMEM;
1629  }
1630  memset(pCursor, 0, sizeof(*pCursor));
1631  pCursor->base.pVtab = pVTab;
1632  pCursor->pLeafCursor = pLeafCursor;
1633  pCursor->iEncoding = iEncoding;
1634
1635  /* If no leaf pages were found, empty result set. */
1636  /* TODO(shess): leafCursorNextValidCell() would return SQLITE_ROW or
1637   * SQLITE_DONE to indicate whether there is further data to consider.
1638   */
1639  pCursor->bEOF = (pLeafCursor->pPage==NULL);
1640
1641  *ppCursor = (sqlite3_vtab_cursor*)pCursor;
1642  return SQLITE_OK;
1643}
1644
1645static int recoverClose(sqlite3_vtab_cursor *cur){
1646  RecoverCursor *pCursor = (RecoverCursor*)cur;
1647  FNENTRY();
1648  if( pCursor->pLeafCursor ){
1649    leafCursorDestroy(pCursor->pLeafCursor);
1650    pCursor->pLeafCursor = NULL;
1651  }
1652  memset(pCursor, 0xA5, sizeof(*pCursor));
1653  sqlite3_free(cur);
1654  return SQLITE_OK;
1655}
1656
1657/* Helpful place to set a breakpoint. */
1658static int RecoverInvalidCell(){
1659  return SQLITE_ERROR;
1660}
1661
1662/* Returns SQLITE_OK if the cell has an appropriate number of columns
1663 * with the appropriate types of data.
1664 */
1665static int recoverValidateLeafCell(Recover *pRecover, RecoverCursor *pCursor){
1666  unsigned i;
1667
1668  /* If the row's storage has too many columns, skip it. */
1669  if( leafCursorCellColumns(pCursor->pLeafCursor)>pRecover->nCols ){
1670    return RecoverInvalidCell();
1671  }
1672
1673  /* Skip rows with unexpected types. */
1674  for( i=0; i<pRecover->nCols; ++i ){
1675    u64 iType;  /* Storage type of column i. */
1676    int rc;
1677
1678    /* ROWID alias. */
1679    if( (pRecover->pTypes[i]&MASK_ROWID) ){
1680      continue;
1681    }
1682
1683    rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iType, NULL, NULL);
1684    assert( rc==SQLITE_OK );
1685    if( rc!=SQLITE_OK || !SerialTypeIsCompatible(iType, pRecover->pTypes[i]) ){
1686      return RecoverInvalidCell();
1687    }
1688  }
1689
1690  return SQLITE_OK;
1691}
1692
1693static int recoverNext(sqlite3_vtab_cursor *pVtabCursor){
1694  RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1695  Recover *pRecover = (Recover*)pCursor->base.pVtab;
1696  int rc;
1697
1698  FNENTRY();
1699
1700  /* Scan forward to the next cell with valid storage, then check that
1701   * the stored data matches the schema.
1702   */
1703  while( (rc = leafCursorNextValidCell(pCursor->pLeafCursor))==SQLITE_ROW ){
1704    if( recoverValidateLeafCell(pRecover, pCursor)==SQLITE_OK ){
1705      return SQLITE_OK;
1706    }
1707  }
1708
1709  if( rc==SQLITE_DONE ){
1710    pCursor->bEOF = 1;
1711    return SQLITE_OK;
1712  }
1713
1714  assert( rc!=SQLITE_OK );
1715  return rc;
1716}
1717
1718static int recoverFilter(
1719  sqlite3_vtab_cursor *pVtabCursor,
1720  int idxNum, const char *idxStr,
1721  int argc, sqlite3_value **argv
1722){
1723  RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1724  Recover *pRecover = (Recover*)pCursor->base.pVtab;
1725  int rc;
1726
1727  FNENTRY();
1728
1729  /* There were no valid leaf pages in the table. */
1730  if( pCursor->bEOF ){
1731    return SQLITE_OK;
1732  }
1733
1734  /* Load the first cell, and iterate forward if it's not valid.  If no cells at
1735   * all are valid, recoverNext() sets bEOF and returns appropriately.
1736   */
1737  rc = leafCursorCellDecode(pCursor->pLeafCursor);
1738  if( rc!=SQLITE_OK || recoverValidateLeafCell(pRecover, pCursor)!=SQLITE_OK ){
1739    return recoverNext(pVtabCursor);
1740  }
1741
1742  return SQLITE_OK;
1743}
1744
1745static int recoverEof(sqlite3_vtab_cursor *pVtabCursor){
1746  RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1747  FNENTRY();
1748  return pCursor->bEOF;
1749}
1750
1751static int recoverColumn(sqlite3_vtab_cursor *cur, sqlite3_context *ctx, int i){
1752  RecoverCursor *pCursor = (RecoverCursor*)cur;
1753  Recover *pRecover = (Recover*)pCursor->base.pVtab;
1754  u64 iColType;             /* Storage type of column i. */
1755  unsigned char *pColData;  /* Column i's data. */
1756  int shouldFree;           /* Non-zero if pColData should be freed. */
1757  int rc;
1758
1759  FNENTRY();
1760
1761  if( i>=pRecover->nCols ){
1762    return SQLITE_ERROR;
1763  }
1764
1765  /* ROWID alias. */
1766  if( (pRecover->pTypes[i]&MASK_ROWID) ){
1767    sqlite3_result_int64(ctx, leafCursorCellRowid(pCursor->pLeafCursor));
1768    return SQLITE_OK;
1769  }
1770
1771  pColData = NULL;
1772  shouldFree = 0;
1773  rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iColType,
1774                             &pColData, &shouldFree);
1775  if( rc!=SQLITE_OK ){
1776    return rc;
1777  }
1778  /* recoverValidateLeafCell() should guarantee that this will never
1779   * occur.
1780   */
1781  if( !SerialTypeIsCompatible(iColType, pRecover->pTypes[i]) ){
1782    if( shouldFree ){
1783      sqlite3_free(pColData);
1784    }
1785    return SQLITE_ERROR;
1786  }
1787
1788  switch( iColType ){
1789    case 0 : sqlite3_result_null(ctx); break;
1790    case 1 : sqlite3_result_int64(ctx, decodeSigned(pColData, 1)); break;
1791    case 2 : sqlite3_result_int64(ctx, decodeSigned(pColData, 2)); break;
1792    case 3 : sqlite3_result_int64(ctx, decodeSigned(pColData, 3)); break;
1793    case 4 : sqlite3_result_int64(ctx, decodeSigned(pColData, 4)); break;
1794    case 5 : sqlite3_result_int64(ctx, decodeSigned(pColData, 6)); break;
1795    case 6 : sqlite3_result_int64(ctx, decodeSigned(pColData, 8)); break;
1796    case 7 : sqlite3_result_double(ctx, decodeFloat64(pColData)); break;
1797    case 8 : sqlite3_result_int(ctx, 0); break;
1798    case 9 : sqlite3_result_int(ctx, 1); break;
1799    case 10 : assert( iColType!=10 ); break;
1800    case 11 : assert( iColType!=11 ); break;
1801
1802    default : {
1803      u32 l = SerialTypeLength(iColType);
1804
1805      /* If pColData was already allocated, arrange to pass ownership. */
1806      sqlite3_destructor_type pFn = SQLITE_TRANSIENT;
1807      if( shouldFree ){
1808        pFn = sqlite3_free;
1809        shouldFree = 0;
1810      }
1811
1812      if( SerialTypeIsBlob(iColType) ){
1813        sqlite3_result_blob(ctx, pColData, l, pFn);
1814      }else{
1815        if( pCursor->iEncoding==SQLITE_UTF16LE ){
1816          sqlite3_result_text16le(ctx, (const void*)pColData, l, pFn);
1817        }else if( pCursor->iEncoding==SQLITE_UTF16BE ){
1818          sqlite3_result_text16be(ctx, (const void*)pColData, l, pFn);
1819        }else{
1820          sqlite3_result_text(ctx, (const char*)pColData, l, pFn);
1821        }
1822      }
1823    } break;
1824  }
1825  if( shouldFree ){
1826    sqlite3_free(pColData);
1827  }
1828  return SQLITE_OK;
1829}
1830
1831static int recoverRowid(sqlite3_vtab_cursor *pVtabCursor, sqlite_int64 *pRowid){
1832  RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1833  FNENTRY();
1834  *pRowid = leafCursorCellRowid(pCursor->pLeafCursor);
1835  return SQLITE_OK;
1836}
1837
1838static sqlite3_module recoverModule = {
1839  0,                         /* iVersion */
1840  recoverCreate,             /* xCreate - create a table */
1841  recoverConnect,            /* xConnect - connect to an existing table */
1842  recoverBestIndex,          /* xBestIndex - Determine search strategy */
1843  recoverDisconnect,         /* xDisconnect - Disconnect from a table */
1844  recoverDestroy,            /* xDestroy - Drop a table */
1845  recoverOpen,               /* xOpen - open a cursor */
1846  recoverClose,              /* xClose - close a cursor */
1847  recoverFilter,             /* xFilter - configure scan constraints */
1848  recoverNext,               /* xNext - advance a cursor */
1849  recoverEof,                /* xEof */
1850  recoverColumn,             /* xColumn - read data */
1851  recoverRowid,              /* xRowid - read data */
1852  0,                         /* xUpdate - write data */
1853  0,                         /* xBegin - begin transaction */
1854  0,                         /* xSync - sync transaction */
1855  0,                         /* xCommit - commit transaction */
1856  0,                         /* xRollback - rollback transaction */
1857  0,                         /* xFindFunction - function overloading */
1858  0,                         /* xRename - rename the table */
1859};
1860
1861int recoverVtableInit(sqlite3 *db){
1862  return sqlite3_create_module_v2(db, "recover", &recoverModule, NULL, 0);
1863}
1864
1865/* This section of code is for parsing the create input and
1866 * initializing the module.
1867 */
1868
1869/* Find the next word in zText and place the endpoints in pzWord*.
1870 * Returns true if the word is non-empty.  "Word" is defined as
1871 * ASCII alphanumeric plus '_' at this time.
1872 */
1873static int findWord(const char *zText,
1874                    const char **pzWordStart, const char **pzWordEnd){
1875  int r;
1876  while( ascii_isspace(*zText) ){
1877    zText++;
1878  }
1879  *pzWordStart = zText;
1880  while( ascii_isalnum(*zText) || *zText=='_' ){
1881    zText++;
1882  }
1883  r = zText>*pzWordStart;  /* In case pzWordStart==pzWordEnd */
1884  *pzWordEnd = zText;
1885  return r;
1886}
1887
1888/* Return true if the next word in zText is zWord, also setting
1889 * *pzContinue to the character after the word.
1890 */
1891static int expectWord(const char *zText, const char *zWord,
1892                      const char **pzContinue){
1893  const char *zWordStart, *zWordEnd;
1894  if( findWord(zText, &zWordStart, &zWordEnd) &&
1895      ascii_strncasecmp(zWord, zWordStart, zWordEnd - zWordStart)==0 ){
1896    *pzContinue = zWordEnd;
1897    return 1;
1898  }
1899  return 0;
1900}
1901
1902/* Parse the name and type information out of parameter.  In case of
1903 * success, *pzNameStart/End contain the name of the column,
1904 * *pzTypeStart/End contain the top-level type, and *pTypeMask has the
1905 * type mask to use for the column.
1906 */
1907static int findNameAndType(const char *parameter,
1908                           const char **pzNameStart, const char **pzNameEnd,
1909                           const char **pzTypeStart, const char **pzTypeEnd,
1910                           unsigned char *pTypeMask){
1911  unsigned nNameLen;   /* Length of found name. */
1912  const char *zEnd;    /* Current end of parsed column information. */
1913  int bNotNull;        /* Non-zero if NULL is not allowed for name. */
1914  int bStrict;         /* Non-zero if column requires exact type match. */
1915  const char *zDummy;  /* Dummy parameter, result unused. */
1916  unsigned i;
1917
1918  /* strictMask is used for STRICT, strictMask|otherMask if STRICT is
1919   * not supplied.  zReplace provides an alternate type to expose to
1920   * the caller.
1921   */
1922  static struct {
1923    const char *zName;
1924    unsigned char strictMask;
1925    unsigned char otherMask;
1926    const char *zReplace;
1927  } kTypeInfo[] = {
1928    { "ANY",
1929      MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL,
1930      0, "",
1931    },
1932    { "ROWID",   MASK_INTEGER | MASK_ROWID,             0, "INTEGER", },
1933    { "INTEGER", MASK_INTEGER | MASK_NULL,              0, NULL, },
1934    { "FLOAT",   MASK_FLOAT | MASK_NULL,                MASK_INTEGER, NULL, },
1935    { "NUMERIC", MASK_INTEGER | MASK_FLOAT | MASK_NULL, MASK_TEXT, NULL, },
1936    { "TEXT",    MASK_TEXT | MASK_NULL,                 MASK_BLOB, NULL, },
1937    { "BLOB",    MASK_BLOB | MASK_NULL,                 0, NULL, },
1938  };
1939
1940  if( !findWord(parameter, pzNameStart, pzNameEnd) ){
1941    return SQLITE_MISUSE;
1942  }
1943
1944  /* Manifest typing, accept any storage type. */
1945  if( !findWord(*pzNameEnd, pzTypeStart, pzTypeEnd) ){
1946    *pzTypeEnd = *pzTypeStart = "";
1947    *pTypeMask = MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL;
1948    return SQLITE_OK;
1949  }
1950
1951  nNameLen = *pzTypeEnd - *pzTypeStart;
1952  for( i=0; i<ArraySize(kTypeInfo); ++i ){
1953    if( ascii_strncasecmp(kTypeInfo[i].zName, *pzTypeStart, nNameLen)==0 ){
1954      break;
1955    }
1956  }
1957  if( i==ArraySize(kTypeInfo) ){
1958    return SQLITE_MISUSE;
1959  }
1960
1961  zEnd = *pzTypeEnd;
1962  bStrict = 0;
1963  if( expectWord(zEnd, "STRICT", &zEnd) ){
1964    /* TODO(shess): Ick.  But I don't want another single-purpose
1965     * flag, either.
1966     */
1967    if( kTypeInfo[i].zReplace && !kTypeInfo[i].zReplace[0] ){
1968      return SQLITE_MISUSE;
1969    }
1970    bStrict = 1;
1971  }
1972
1973  bNotNull = 0;
1974  if( expectWord(zEnd, "NOT", &zEnd) ){
1975    if( expectWord(zEnd, "NULL", &zEnd) ){
1976      bNotNull = 1;
1977    }else{
1978      /* Anything other than NULL after NOT is an error. */
1979      return SQLITE_MISUSE;
1980    }
1981  }
1982
1983  /* Anything else is an error. */
1984  if( findWord(zEnd, &zDummy, &zDummy) ){
1985    return SQLITE_MISUSE;
1986  }
1987
1988  *pTypeMask = kTypeInfo[i].strictMask;
1989  if( !bStrict ){
1990    *pTypeMask |= kTypeInfo[i].otherMask;
1991  }
1992  if( bNotNull ){
1993    *pTypeMask &= ~MASK_NULL;
1994  }
1995  if( kTypeInfo[i].zReplace ){
1996    *pzTypeStart = kTypeInfo[i].zReplace;
1997    *pzTypeEnd = *pzTypeStart + strlen(*pzTypeStart);
1998  }
1999  return SQLITE_OK;
2000}
2001
2002/* Parse the arguments, placing type masks in *pTypes and the exposed
2003 * schema in *pzCreateSql (for sqlite3_declare_vtab).
2004 */
2005static int ParseColumnsAndGenerateCreate(unsigned nCols,
2006                                         const char *const *pCols,
2007                                         char **pzCreateSql,
2008                                         unsigned char *pTypes,
2009                                         char **pzErr){
2010  unsigned i;
2011  char *zCreateSql = sqlite3_mprintf("CREATE TABLE x(");
2012  if( !zCreateSql ){
2013    return SQLITE_NOMEM;
2014  }
2015
2016  for( i=0; i<nCols; i++ ){
2017    const char *zSep = (i < nCols - 1 ? ", " : ")");
2018    const char *zNotNull = "";
2019    const char *zNameStart, *zNameEnd;
2020    const char *zTypeStart, *zTypeEnd;
2021    int rc = findNameAndType(pCols[i],
2022                             &zNameStart, &zNameEnd,
2023                             &zTypeStart, &zTypeEnd,
2024                             &pTypes[i]);
2025    if( rc!=SQLITE_OK ){
2026      *pzErr = sqlite3_mprintf("unable to parse column %d", i);
2027      sqlite3_free(zCreateSql);
2028      return rc;
2029    }
2030
2031    if( !(pTypes[i]&MASK_NULL) ){
2032      zNotNull = " NOT NULL";
2033    }
2034
2035    /* Add name and type to the create statement. */
2036    zCreateSql = sqlite3_mprintf("%z%.*s %.*s%s%s",
2037                                 zCreateSql,
2038                                 zNameEnd - zNameStart, zNameStart,
2039                                 zTypeEnd - zTypeStart, zTypeStart,
2040                                 zNotNull, zSep);
2041    if( !zCreateSql ){
2042      return SQLITE_NOMEM;
2043    }
2044  }
2045
2046  *pzCreateSql = zCreateSql;
2047  return SQLITE_OK;
2048}
2049
2050/* Helper function for initializing the module. */
2051/* argv[0] module name
2052 * argv[1] db name for virtual table
2053 * argv[2] virtual table name
2054 * argv[3] backing table name
2055 * argv[4] columns
2056 */
2057/* TODO(shess): Since connect isn't supported, could inline into
2058 * recoverCreate().
2059 */
2060/* TODO(shess): Explore cases where it would make sense to set *pzErr. */
2061static int recoverInit(
2062  sqlite3 *db,                        /* Database connection */
2063  void *pAux,                         /* unused */
2064  int argc, const char *const*argv,   /* Parameters to CREATE TABLE statement */
2065  sqlite3_vtab **ppVtab,              /* OUT: New virtual table */
2066  char **pzErr                        /* OUT: Error message, if any */
2067){
2068  const unsigned kTypeCol = 4;  /* First argument with column type info. */
2069  Recover *pRecover;            /* Virtual table structure being created. */
2070  char *zDot;                   /* Any dot found in "db.table" backing. */
2071  u32 iRootPage;                /* Root page of backing table. */
2072  char *zCreateSql;             /* Schema of created virtual table. */
2073  int rc;
2074
2075  /* Require to be in the temp database. */
2076  if( ascii_strcasecmp(argv[1], "temp")!=0 ){
2077    *pzErr = sqlite3_mprintf("recover table must be in temp database");
2078    return SQLITE_MISUSE;
2079  }
2080
2081  /* Need the backing table and at least one column. */
2082  if( argc<=kTypeCol ){
2083    *pzErr = sqlite3_mprintf("no columns specified");
2084    return SQLITE_MISUSE;
2085  }
2086
2087  pRecover = sqlite3_malloc(sizeof(Recover));
2088  if( !pRecover ){
2089    return SQLITE_NOMEM;
2090  }
2091  memset(pRecover, 0, sizeof(*pRecover));
2092  pRecover->base.pModule = &recoverModule;
2093  pRecover->db = db;
2094
2095  /* Parse out db.table, assuming main if no dot. */
2096  zDot = strchr(argv[3], '.');
2097  if( !zDot ){
2098    pRecover->zDb = sqlite3_strdup(db->aDb[0].zName);
2099    pRecover->zTable = sqlite3_strdup(argv[3]);
2100  }else if( zDot>argv[3] && zDot[1]!='\0' ){
2101    pRecover->zDb = sqlite3_strndup(argv[3], zDot - argv[3]);
2102    pRecover->zTable = sqlite3_strdup(zDot + 1);
2103  }else{
2104    /* ".table" or "db." not allowed. */
2105    *pzErr = sqlite3_mprintf("ill-formed table specifier");
2106    recoverRelease(pRecover);
2107    return SQLITE_ERROR;
2108  }
2109
2110  pRecover->nCols = argc - kTypeCol;
2111  pRecover->pTypes = sqlite3_malloc(pRecover->nCols);
2112  if( !pRecover->zDb || !pRecover->zTable || !pRecover->pTypes ){
2113    recoverRelease(pRecover);
2114    return SQLITE_NOMEM;
2115  }
2116
2117  /* Require the backing table to exist. */
2118  /* TODO(shess): Be more pedantic about the form of the descriptor
2119   * string.  This already fails for poorly-formed strings, simply
2120   * because there won't be a root page, but it would make more sense
2121   * to be explicit.
2122   */
2123  rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable, &iRootPage);
2124  if( rc!=SQLITE_OK ){
2125    *pzErr = sqlite3_mprintf("unable to find backing table");
2126    recoverRelease(pRecover);
2127    return rc;
2128  }
2129
2130  /* Parse the column definitions. */
2131  rc = ParseColumnsAndGenerateCreate(pRecover->nCols, argv + kTypeCol,
2132                                     &zCreateSql, pRecover->pTypes, pzErr);
2133  if( rc!=SQLITE_OK ){
2134    recoverRelease(pRecover);
2135    return rc;
2136  }
2137
2138  rc = sqlite3_declare_vtab(db, zCreateSql);
2139  sqlite3_free(zCreateSql);
2140  if( rc!=SQLITE_OK ){
2141    recoverRelease(pRecover);
2142    return rc;
2143  }
2144
2145  *ppVtab = (sqlite3_vtab *)pRecover;
2146  return SQLITE_OK;
2147}
2148