1/* 2** 2004 April 6 3** 4** The author disclaims copyright to this source code. In place of 5** a legal notice, here is a blessing: 6** 7** May you do good and not evil. 8** May you find forgiveness for yourself and forgive others. 9** May you share freely, never taking more than you give. 10** 11************************************************************************* 12** This file implements a external (disk-based) database using BTrees. 13** See the header comment on "btreeInt.h" for additional information. 14** Including a description of file format and an overview of operation. 15*/ 16#include "btreeInt.h" 17 18/* 19** The header string that appears at the beginning of every 20** SQLite database. 21*/ 22static const char zMagicHeader[] = SQLITE_FILE_HEADER; 23 24/* 25** Set this global variable to 1 to enable tracing using the TRACE 26** macro. 27*/ 28#if 0 29int sqlite3BtreeTrace=1; /* True to enable tracing */ 30# define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 31#else 32# define TRACE(X) 33#endif 34 35/* 36** Extract a 2-byte big-endian integer from an array of unsigned bytes. 37** But if the value is zero, make it 65536. 38** 39** This routine is used to extract the "offset to cell content area" value 40** from the header of a btree page. If the page size is 65536 and the page 41** is empty, the offset should be 65536, but the 2-byte value stores zero. 42** This routine makes the necessary adjustment to 65536. 43*/ 44#define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1) 45 46#ifndef SQLITE_OMIT_SHARED_CACHE 47/* 48** A list of BtShared objects that are eligible for participation 49** in shared cache. This variable has file scope during normal builds, 50** but the test harness needs to access it so we make it global for 51** test builds. 52** 53** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER. 54*/ 55#ifdef SQLITE_TEST 56BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 57#else 58static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 59#endif 60#endif /* SQLITE_OMIT_SHARED_CACHE */ 61 62#ifndef SQLITE_OMIT_SHARED_CACHE 63/* 64** Enable or disable the shared pager and schema features. 65** 66** This routine has no effect on existing database connections. 67** The shared cache setting effects only future calls to 68** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 69*/ 70int sqlite3_enable_shared_cache(int enable){ 71 sqlite3GlobalConfig.sharedCacheEnabled = enable; 72 return SQLITE_OK; 73} 74#endif 75 76 77 78#ifdef SQLITE_OMIT_SHARED_CACHE 79 /* 80 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), 81 ** and clearAllSharedCacheTableLocks() 82 ** manipulate entries in the BtShared.pLock linked list used to store 83 ** shared-cache table level locks. If the library is compiled with the 84 ** shared-cache feature disabled, then there is only ever one user 85 ** of each BtShared structure and so this locking is not necessary. 86 ** So define the lock related functions as no-ops. 87 */ 88 #define querySharedCacheTableLock(a,b,c) SQLITE_OK 89 #define setSharedCacheTableLock(a,b,c) SQLITE_OK 90 #define clearAllSharedCacheTableLocks(a) 91 #define downgradeAllSharedCacheTableLocks(a) 92 #define hasSharedCacheTableLock(a,b,c,d) 1 93 #define hasReadConflicts(a, b) 0 94#endif 95 96#ifndef SQLITE_OMIT_SHARED_CACHE 97 98#ifdef SQLITE_DEBUG 99/* 100**** This function is only used as part of an assert() statement. *** 101** 102** Check to see if pBtree holds the required locks to read or write to the 103** table with root page iRoot. Return 1 if it does and 0 if not. 104** 105** For example, when writing to a table with root-page iRoot via 106** Btree connection pBtree: 107** 108** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); 109** 110** When writing to an index that resides in a sharable database, the 111** caller should have first obtained a lock specifying the root page of 112** the corresponding table. This makes things a bit more complicated, 113** as this module treats each table as a separate structure. To determine 114** the table corresponding to the index being written, this 115** function has to search through the database schema. 116** 117** Instead of a lock on the table/index rooted at page iRoot, the caller may 118** hold a write-lock on the schema table (root page 1). This is also 119** acceptable. 120*/ 121static int hasSharedCacheTableLock( 122 Btree *pBtree, /* Handle that must hold lock */ 123 Pgno iRoot, /* Root page of b-tree */ 124 int isIndex, /* True if iRoot is the root of an index b-tree */ 125 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */ 126){ 127 Schema *pSchema = (Schema *)pBtree->pBt->pSchema; 128 Pgno iTab = 0; 129 BtLock *pLock; 130 131 /* If this database is not shareable, or if the client is reading 132 ** and has the read-uncommitted flag set, then no lock is required. 133 ** Return true immediately. 134 */ 135 if( (pBtree->sharable==0) 136 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted)) 137 ){ 138 return 1; 139 } 140 141 /* If the client is reading or writing an index and the schema is 142 ** not loaded, then it is too difficult to actually check to see if 143 ** the correct locks are held. So do not bother - just return true. 144 ** This case does not come up very often anyhow. 145 */ 146 if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){ 147 return 1; 148 } 149 150 /* Figure out the root-page that the lock should be held on. For table 151 ** b-trees, this is just the root page of the b-tree being read or 152 ** written. For index b-trees, it is the root page of the associated 153 ** table. */ 154 if( isIndex ){ 155 HashElem *p; 156 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ 157 Index *pIdx = (Index *)sqliteHashData(p); 158 if( pIdx->tnum==(int)iRoot ){ 159 iTab = pIdx->pTable->tnum; 160 } 161 } 162 }else{ 163 iTab = iRoot; 164 } 165 166 /* Search for the required lock. Either a write-lock on root-page iTab, a 167 ** write-lock on the schema table, or (if the client is reading) a 168 ** read-lock on iTab will suffice. Return 1 if any of these are found. */ 169 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ 170 if( pLock->pBtree==pBtree 171 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) 172 && pLock->eLock>=eLockType 173 ){ 174 return 1; 175 } 176 } 177 178 /* Failed to find the required lock. */ 179 return 0; 180} 181#endif /* SQLITE_DEBUG */ 182 183#ifdef SQLITE_DEBUG 184/* 185**** This function may be used as part of assert() statements only. **** 186** 187** Return true if it would be illegal for pBtree to write into the 188** table or index rooted at iRoot because other shared connections are 189** simultaneously reading that same table or index. 190** 191** It is illegal for pBtree to write if some other Btree object that 192** shares the same BtShared object is currently reading or writing 193** the iRoot table. Except, if the other Btree object has the 194** read-uncommitted flag set, then it is OK for the other object to 195** have a read cursor. 196** 197** For example, before writing to any part of the table or index 198** rooted at page iRoot, one should call: 199** 200** assert( !hasReadConflicts(pBtree, iRoot) ); 201*/ 202static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ 203 BtCursor *p; 204 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 205 if( p->pgnoRoot==iRoot 206 && p->pBtree!=pBtree 207 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted) 208 ){ 209 return 1; 210 } 211 } 212 return 0; 213} 214#endif /* #ifdef SQLITE_DEBUG */ 215 216/* 217** Query to see if Btree handle p may obtain a lock of type eLock 218** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 219** SQLITE_OK if the lock may be obtained (by calling 220** setSharedCacheTableLock()), or SQLITE_LOCKED if not. 221*/ 222static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ 223 BtShared *pBt = p->pBt; 224 BtLock *pIter; 225 226 assert( sqlite3BtreeHoldsMutex(p) ); 227 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 228 assert( p->db!=0 ); 229 assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 ); 230 231 /* If requesting a write-lock, then the Btree must have an open write 232 ** transaction on this file. And, obviously, for this to be so there 233 ** must be an open write transaction on the file itself. 234 */ 235 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); 236 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); 237 238 /* This routine is a no-op if the shared-cache is not enabled */ 239 if( !p->sharable ){ 240 return SQLITE_OK; 241 } 242 243 /* If some other connection is holding an exclusive lock, the 244 ** requested lock may not be obtained. 245 */ 246 if( pBt->pWriter!=p && pBt->isExclusive ){ 247 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); 248 return SQLITE_LOCKED_SHAREDCACHE; 249 } 250 251 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 252 /* The condition (pIter->eLock!=eLock) in the following if(...) 253 ** statement is a simplification of: 254 ** 255 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) 256 ** 257 ** since we know that if eLock==WRITE_LOCK, then no other connection 258 ** may hold a WRITE_LOCK on any table in this file (since there can 259 ** only be a single writer). 260 */ 261 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); 262 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); 263 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ 264 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); 265 if( eLock==WRITE_LOCK ){ 266 assert( p==pBt->pWriter ); 267 pBt->isPending = 1; 268 } 269 return SQLITE_LOCKED_SHAREDCACHE; 270 } 271 } 272 return SQLITE_OK; 273} 274#endif /* !SQLITE_OMIT_SHARED_CACHE */ 275 276#ifndef SQLITE_OMIT_SHARED_CACHE 277/* 278** Add a lock on the table with root-page iTable to the shared-btree used 279** by Btree handle p. Parameter eLock must be either READ_LOCK or 280** WRITE_LOCK. 281** 282** This function assumes the following: 283** 284** (a) The specified Btree object p is connected to a sharable 285** database (one with the BtShared.sharable flag set), and 286** 287** (b) No other Btree objects hold a lock that conflicts 288** with the requested lock (i.e. querySharedCacheTableLock() has 289** already been called and returned SQLITE_OK). 290** 291** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 292** is returned if a malloc attempt fails. 293*/ 294static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ 295 BtShared *pBt = p->pBt; 296 BtLock *pLock = 0; 297 BtLock *pIter; 298 299 assert( sqlite3BtreeHoldsMutex(p) ); 300 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 301 assert( p->db!=0 ); 302 303 /* A connection with the read-uncommitted flag set will never try to 304 ** obtain a read-lock using this function. The only read-lock obtained 305 ** by a connection in read-uncommitted mode is on the sqlite_master 306 ** table, and that lock is obtained in BtreeBeginTrans(). */ 307 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK ); 308 309 /* This function should only be called on a sharable b-tree after it 310 ** has been determined that no other b-tree holds a conflicting lock. */ 311 assert( p->sharable ); 312 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); 313 314 /* First search the list for an existing lock on this table. */ 315 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 316 if( pIter->iTable==iTable && pIter->pBtree==p ){ 317 pLock = pIter; 318 break; 319 } 320 } 321 322 /* If the above search did not find a BtLock struct associating Btree p 323 ** with table iTable, allocate one and link it into the list. 324 */ 325 if( !pLock ){ 326 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 327 if( !pLock ){ 328 return SQLITE_NOMEM; 329 } 330 pLock->iTable = iTable; 331 pLock->pBtree = p; 332 pLock->pNext = pBt->pLock; 333 pBt->pLock = pLock; 334 } 335 336 /* Set the BtLock.eLock variable to the maximum of the current lock 337 ** and the requested lock. This means if a write-lock was already held 338 ** and a read-lock requested, we don't incorrectly downgrade the lock. 339 */ 340 assert( WRITE_LOCK>READ_LOCK ); 341 if( eLock>pLock->eLock ){ 342 pLock->eLock = eLock; 343 } 344 345 return SQLITE_OK; 346} 347#endif /* !SQLITE_OMIT_SHARED_CACHE */ 348 349#ifndef SQLITE_OMIT_SHARED_CACHE 350/* 351** Release all the table locks (locks obtained via calls to 352** the setSharedCacheTableLock() procedure) held by Btree object p. 353** 354** This function assumes that Btree p has an open read or write 355** transaction. If it does not, then the BtShared.isPending variable 356** may be incorrectly cleared. 357*/ 358static void clearAllSharedCacheTableLocks(Btree *p){ 359 BtShared *pBt = p->pBt; 360 BtLock **ppIter = &pBt->pLock; 361 362 assert( sqlite3BtreeHoldsMutex(p) ); 363 assert( p->sharable || 0==*ppIter ); 364 assert( p->inTrans>0 ); 365 366 while( *ppIter ){ 367 BtLock *pLock = *ppIter; 368 assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree ); 369 assert( pLock->pBtree->inTrans>=pLock->eLock ); 370 if( pLock->pBtree==p ){ 371 *ppIter = pLock->pNext; 372 assert( pLock->iTable!=1 || pLock==&p->lock ); 373 if( pLock->iTable!=1 ){ 374 sqlite3_free(pLock); 375 } 376 }else{ 377 ppIter = &pLock->pNext; 378 } 379 } 380 381 assert( pBt->isPending==0 || pBt->pWriter ); 382 if( pBt->pWriter==p ){ 383 pBt->pWriter = 0; 384 pBt->isExclusive = 0; 385 pBt->isPending = 0; 386 }else if( pBt->nTransaction==2 ){ 387 /* This function is called when Btree p is concluding its 388 ** transaction. If there currently exists a writer, and p is not 389 ** that writer, then the number of locks held by connections other 390 ** than the writer must be about to drop to zero. In this case 391 ** set the isPending flag to 0. 392 ** 393 ** If there is not currently a writer, then BtShared.isPending must 394 ** be zero already. So this next line is harmless in that case. 395 */ 396 pBt->isPending = 0; 397 } 398} 399 400/* 401** This function changes all write-locks held by Btree p into read-locks. 402*/ 403static void downgradeAllSharedCacheTableLocks(Btree *p){ 404 BtShared *pBt = p->pBt; 405 if( pBt->pWriter==p ){ 406 BtLock *pLock; 407 pBt->pWriter = 0; 408 pBt->isExclusive = 0; 409 pBt->isPending = 0; 410 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 411 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); 412 pLock->eLock = READ_LOCK; 413 } 414 } 415} 416 417#endif /* SQLITE_OMIT_SHARED_CACHE */ 418 419static void releasePage(MemPage *pPage); /* Forward reference */ 420 421/* 422***** This routine is used inside of assert() only **** 423** 424** Verify that the cursor holds the mutex on its BtShared 425*/ 426#ifdef SQLITE_DEBUG 427static int cursorHoldsMutex(BtCursor *p){ 428 return sqlite3_mutex_held(p->pBt->mutex); 429} 430#endif 431 432 433#ifndef SQLITE_OMIT_INCRBLOB 434/* 435** Invalidate the overflow page-list cache for cursor pCur, if any. 436*/ 437static void invalidateOverflowCache(BtCursor *pCur){ 438 assert( cursorHoldsMutex(pCur) ); 439 sqlite3_free(pCur->aOverflow); 440 pCur->aOverflow = 0; 441} 442 443/* 444** Invalidate the overflow page-list cache for all cursors opened 445** on the shared btree structure pBt. 446*/ 447static void invalidateAllOverflowCache(BtShared *pBt){ 448 BtCursor *p; 449 assert( sqlite3_mutex_held(pBt->mutex) ); 450 for(p=pBt->pCursor; p; p=p->pNext){ 451 invalidateOverflowCache(p); 452 } 453} 454 455/* 456** This function is called before modifying the contents of a table 457** to invalidate any incrblob cursors that are open on the 458** row or one of the rows being modified. 459** 460** If argument isClearTable is true, then the entire contents of the 461** table is about to be deleted. In this case invalidate all incrblob 462** cursors open on any row within the table with root-page pgnoRoot. 463** 464** Otherwise, if argument isClearTable is false, then the row with 465** rowid iRow is being replaced or deleted. In this case invalidate 466** only those incrblob cursors open on that specific row. 467*/ 468static void invalidateIncrblobCursors( 469 Btree *pBtree, /* The database file to check */ 470 i64 iRow, /* The rowid that might be changing */ 471 int isClearTable /* True if all rows are being deleted */ 472){ 473 BtCursor *p; 474 BtShared *pBt = pBtree->pBt; 475 assert( sqlite3BtreeHoldsMutex(pBtree) ); 476 for(p=pBt->pCursor; p; p=p->pNext){ 477 if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){ 478 p->eState = CURSOR_INVALID; 479 } 480 } 481} 482 483#else 484 /* Stub functions when INCRBLOB is omitted */ 485 #define invalidateOverflowCache(x) 486 #define invalidateAllOverflowCache(x) 487 #define invalidateIncrblobCursors(x,y,z) 488#endif /* SQLITE_OMIT_INCRBLOB */ 489 490/* 491** Set bit pgno of the BtShared.pHasContent bitvec. This is called 492** when a page that previously contained data becomes a free-list leaf 493** page. 494** 495** The BtShared.pHasContent bitvec exists to work around an obscure 496** bug caused by the interaction of two useful IO optimizations surrounding 497** free-list leaf pages: 498** 499** 1) When all data is deleted from a page and the page becomes 500** a free-list leaf page, the page is not written to the database 501** (as free-list leaf pages contain no meaningful data). Sometimes 502** such a page is not even journalled (as it will not be modified, 503** why bother journalling it?). 504** 505** 2) When a free-list leaf page is reused, its content is not read 506** from the database or written to the journal file (why should it 507** be, if it is not at all meaningful?). 508** 509** By themselves, these optimizations work fine and provide a handy 510** performance boost to bulk delete or insert operations. However, if 511** a page is moved to the free-list and then reused within the same 512** transaction, a problem comes up. If the page is not journalled when 513** it is moved to the free-list and it is also not journalled when it 514** is extracted from the free-list and reused, then the original data 515** may be lost. In the event of a rollback, it may not be possible 516** to restore the database to its original configuration. 517** 518** The solution is the BtShared.pHasContent bitvec. Whenever a page is 519** moved to become a free-list leaf page, the corresponding bit is 520** set in the bitvec. Whenever a leaf page is extracted from the free-list, 521** optimization 2 above is omitted if the corresponding bit is already 522** set in BtShared.pHasContent. The contents of the bitvec are cleared 523** at the end of every transaction. 524*/ 525static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ 526 int rc = SQLITE_OK; 527 if( !pBt->pHasContent ){ 528 assert( pgno<=pBt->nPage ); 529 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage); 530 if( !pBt->pHasContent ){ 531 rc = SQLITE_NOMEM; 532 } 533 } 534 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ 535 rc = sqlite3BitvecSet(pBt->pHasContent, pgno); 536 } 537 return rc; 538} 539 540/* 541** Query the BtShared.pHasContent vector. 542** 543** This function is called when a free-list leaf page is removed from the 544** free-list for reuse. It returns false if it is safe to retrieve the 545** page from the pager layer with the 'no-content' flag set. True otherwise. 546*/ 547static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ 548 Bitvec *p = pBt->pHasContent; 549 return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno))); 550} 551 552/* 553** Clear (destroy) the BtShared.pHasContent bitvec. This should be 554** invoked at the conclusion of each write-transaction. 555*/ 556static void btreeClearHasContent(BtShared *pBt){ 557 sqlite3BitvecDestroy(pBt->pHasContent); 558 pBt->pHasContent = 0; 559} 560 561/* 562** Save the current cursor position in the variables BtCursor.nKey 563** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 564** 565** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) 566** prior to calling this routine. 567*/ 568static int saveCursorPosition(BtCursor *pCur){ 569 int rc; 570 571 assert( CURSOR_VALID==pCur->eState ); 572 assert( 0==pCur->pKey ); 573 assert( cursorHoldsMutex(pCur) ); 574 575 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); 576 assert( rc==SQLITE_OK ); /* KeySize() cannot fail */ 577 578 /* If this is an intKey table, then the above call to BtreeKeySize() 579 ** stores the integer key in pCur->nKey. In this case this value is 580 ** all that is required. Otherwise, if pCur is not open on an intKey 581 ** table, then malloc space for and store the pCur->nKey bytes of key 582 ** data. 583 */ 584 if( 0==pCur->apPage[0]->intKey ){ 585 void *pKey = sqlite3Malloc( (int)pCur->nKey ); 586 if( pKey ){ 587 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey); 588 if( rc==SQLITE_OK ){ 589 pCur->pKey = pKey; 590 }else{ 591 sqlite3_free(pKey); 592 } 593 }else{ 594 rc = SQLITE_NOMEM; 595 } 596 } 597 assert( !pCur->apPage[0]->intKey || !pCur->pKey ); 598 599 if( rc==SQLITE_OK ){ 600 int i; 601 for(i=0; i<=pCur->iPage; i++){ 602 releasePage(pCur->apPage[i]); 603 pCur->apPage[i] = 0; 604 } 605 pCur->iPage = -1; 606 pCur->eState = CURSOR_REQUIRESEEK; 607 } 608 609 invalidateOverflowCache(pCur); 610 return rc; 611} 612 613/* 614** Save the positions of all cursors (except pExcept) that are open on 615** the table with root-page iRoot. Usually, this is called just before cursor 616** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). 617*/ 618static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 619 BtCursor *p; 620 assert( sqlite3_mutex_held(pBt->mutex) ); 621 assert( pExcept==0 || pExcept->pBt==pBt ); 622 for(p=pBt->pCursor; p; p=p->pNext){ 623 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 624 p->eState==CURSOR_VALID ){ 625 int rc = saveCursorPosition(p); 626 if( SQLITE_OK!=rc ){ 627 return rc; 628 } 629 } 630 } 631 return SQLITE_OK; 632} 633 634/* 635** Clear the current cursor position. 636*/ 637void sqlite3BtreeClearCursor(BtCursor *pCur){ 638 assert( cursorHoldsMutex(pCur) ); 639 sqlite3_free(pCur->pKey); 640 pCur->pKey = 0; 641 pCur->eState = CURSOR_INVALID; 642} 643 644/* 645** In this version of BtreeMoveto, pKey is a packed index record 646** such as is generated by the OP_MakeRecord opcode. Unpack the 647** record and then call BtreeMovetoUnpacked() to do the work. 648*/ 649static int btreeMoveto( 650 BtCursor *pCur, /* Cursor open on the btree to be searched */ 651 const void *pKey, /* Packed key if the btree is an index */ 652 i64 nKey, /* Integer key for tables. Size of pKey for indices */ 653 int bias, /* Bias search to the high end */ 654 int *pRes /* Write search results here */ 655){ 656 int rc; /* Status code */ 657 UnpackedRecord *pIdxKey; /* Unpacked index key */ 658 char aSpace[150]; /* Temp space for pIdxKey - to avoid a malloc */ 659 660 if( pKey ){ 661 assert( nKey==(i64)(int)nKey ); 662 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, 663 aSpace, sizeof(aSpace)); 664 if( pIdxKey==0 ) return SQLITE_NOMEM; 665 }else{ 666 pIdxKey = 0; 667 } 668 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes); 669 if( pKey ){ 670 sqlite3VdbeDeleteUnpackedRecord(pIdxKey); 671 } 672 return rc; 673} 674 675/* 676** Restore the cursor to the position it was in (or as close to as possible) 677** when saveCursorPosition() was called. Note that this call deletes the 678** saved position info stored by saveCursorPosition(), so there can be 679** at most one effective restoreCursorPosition() call after each 680** saveCursorPosition(). 681*/ 682static int btreeRestoreCursorPosition(BtCursor *pCur){ 683 int rc; 684 assert( cursorHoldsMutex(pCur) ); 685 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 686 if( pCur->eState==CURSOR_FAULT ){ 687 return pCur->skipNext; 688 } 689 pCur->eState = CURSOR_INVALID; 690 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext); 691 if( rc==SQLITE_OK ){ 692 sqlite3_free(pCur->pKey); 693 pCur->pKey = 0; 694 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 695 } 696 return rc; 697} 698 699#define restoreCursorPosition(p) \ 700 (p->eState>=CURSOR_REQUIRESEEK ? \ 701 btreeRestoreCursorPosition(p) : \ 702 SQLITE_OK) 703 704/* 705** Determine whether or not a cursor has moved from the position it 706** was last placed at. Cursors can move when the row they are pointing 707** at is deleted out from under them. 708** 709** This routine returns an error code if something goes wrong. The 710** integer *pHasMoved is set to one if the cursor has moved and 0 if not. 711*/ 712int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){ 713 int rc; 714 715 rc = restoreCursorPosition(pCur); 716 if( rc ){ 717 *pHasMoved = 1; 718 return rc; 719 } 720 if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){ 721 *pHasMoved = 1; 722 }else{ 723 *pHasMoved = 0; 724 } 725 return SQLITE_OK; 726} 727 728#ifndef SQLITE_OMIT_AUTOVACUUM 729/* 730** Given a page number of a regular database page, return the page 731** number for the pointer-map page that contains the entry for the 732** input page number. 733** 734** Return 0 (not a valid page) for pgno==1 since there is 735** no pointer map associated with page 1. The integrity_check logic 736** requires that ptrmapPageno(*,1)!=1. 737*/ 738static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 739 int nPagesPerMapPage; 740 Pgno iPtrMap, ret; 741 assert( sqlite3_mutex_held(pBt->mutex) ); 742 if( pgno<2 ) return 0; 743 nPagesPerMapPage = (pBt->usableSize/5)+1; 744 iPtrMap = (pgno-2)/nPagesPerMapPage; 745 ret = (iPtrMap*nPagesPerMapPage) + 2; 746 if( ret==PENDING_BYTE_PAGE(pBt) ){ 747 ret++; 748 } 749 return ret; 750} 751 752/* 753** Write an entry into the pointer map. 754** 755** This routine updates the pointer map entry for page number 'key' 756** so that it maps to type 'eType' and parent page number 'pgno'. 757** 758** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is 759** a no-op. If an error occurs, the appropriate error code is written 760** into *pRC. 761*/ 762static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ 763 DbPage *pDbPage; /* The pointer map page */ 764 u8 *pPtrmap; /* The pointer map data */ 765 Pgno iPtrmap; /* The pointer map page number */ 766 int offset; /* Offset in pointer map page */ 767 int rc; /* Return code from subfunctions */ 768 769 if( *pRC ) return; 770 771 assert( sqlite3_mutex_held(pBt->mutex) ); 772 /* The master-journal page number must never be used as a pointer map page */ 773 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 774 775 assert( pBt->autoVacuum ); 776 if( key==0 ){ 777 *pRC = SQLITE_CORRUPT_BKPT; 778 return; 779 } 780 iPtrmap = PTRMAP_PAGENO(pBt, key); 781 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 782 if( rc!=SQLITE_OK ){ 783 *pRC = rc; 784 return; 785 } 786 offset = PTRMAP_PTROFFSET(iPtrmap, key); 787 if( offset<0 ){ 788 *pRC = SQLITE_CORRUPT_BKPT; 789 goto ptrmap_exit; 790 } 791 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 792 793 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 794 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); 795 *pRC= rc = sqlite3PagerWrite(pDbPage); 796 if( rc==SQLITE_OK ){ 797 pPtrmap[offset] = eType; 798 put4byte(&pPtrmap[offset+1], parent); 799 } 800 } 801 802ptrmap_exit: 803 sqlite3PagerUnref(pDbPage); 804} 805 806/* 807** Read an entry from the pointer map. 808** 809** This routine retrieves the pointer map entry for page 'key', writing 810** the type and parent page number to *pEType and *pPgno respectively. 811** An error code is returned if something goes wrong, otherwise SQLITE_OK. 812*/ 813static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 814 DbPage *pDbPage; /* The pointer map page */ 815 int iPtrmap; /* Pointer map page index */ 816 u8 *pPtrmap; /* Pointer map page data */ 817 int offset; /* Offset of entry in pointer map */ 818 int rc; 819 820 assert( sqlite3_mutex_held(pBt->mutex) ); 821 822 iPtrmap = PTRMAP_PAGENO(pBt, key); 823 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); 824 if( rc!=0 ){ 825 return rc; 826 } 827 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 828 829 offset = PTRMAP_PTROFFSET(iPtrmap, key); 830 assert( pEType!=0 ); 831 *pEType = pPtrmap[offset]; 832 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 833 834 sqlite3PagerUnref(pDbPage); 835 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; 836 return SQLITE_OK; 837} 838 839#else /* if defined SQLITE_OMIT_AUTOVACUUM */ 840 #define ptrmapPut(w,x,y,z,rc) 841 #define ptrmapGet(w,x,y,z) SQLITE_OK 842 #define ptrmapPutOvflPtr(x, y, rc) 843#endif 844 845/* 846** Given a btree page and a cell index (0 means the first cell on 847** the page, 1 means the second cell, and so forth) return a pointer 848** to the cell content. 849** 850** This routine works only for pages that do not contain overflow cells. 851*/ 852#define findCell(P,I) \ 853 ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)]))) 854 855/* 856** This a more complex version of findCell() that works for 857** pages that do contain overflow cells. 858*/ 859static u8 *findOverflowCell(MemPage *pPage, int iCell){ 860 int i; 861 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 862 for(i=pPage->nOverflow-1; i>=0; i--){ 863 int k; 864 struct _OvflCell *pOvfl; 865 pOvfl = &pPage->aOvfl[i]; 866 k = pOvfl->idx; 867 if( k<=iCell ){ 868 if( k==iCell ){ 869 return pOvfl->pCell; 870 } 871 iCell--; 872 } 873 } 874 return findCell(pPage, iCell); 875} 876 877/* 878** Parse a cell content block and fill in the CellInfo structure. There 879** are two versions of this function. btreeParseCell() takes a 880** cell index as the second argument and btreeParseCellPtr() 881** takes a pointer to the body of the cell as its second argument. 882** 883** Within this file, the parseCell() macro can be called instead of 884** btreeParseCellPtr(). Using some compilers, this will be faster. 885*/ 886static void btreeParseCellPtr( 887 MemPage *pPage, /* Page containing the cell */ 888 u8 *pCell, /* Pointer to the cell text. */ 889 CellInfo *pInfo /* Fill in this structure */ 890){ 891 u16 n; /* Number bytes in cell content header */ 892 u32 nPayload; /* Number of bytes of cell payload */ 893 894 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 895 896 pInfo->pCell = pCell; 897 assert( pPage->leaf==0 || pPage->leaf==1 ); 898 n = pPage->childPtrSize; 899 assert( n==4-4*pPage->leaf ); 900 if( pPage->intKey ){ 901 if( pPage->hasData ){ 902 n += getVarint32(&pCell[n], nPayload); 903 }else{ 904 nPayload = 0; 905 } 906 n += getVarint(&pCell[n], (u64*)&pInfo->nKey); 907 pInfo->nData = nPayload; 908 }else{ 909 pInfo->nData = 0; 910 n += getVarint32(&pCell[n], nPayload); 911 pInfo->nKey = nPayload; 912 } 913 pInfo->nPayload = nPayload; 914 pInfo->nHeader = n; 915 testcase( nPayload==pPage->maxLocal ); 916 testcase( nPayload==pPage->maxLocal+1 ); 917 if( likely(nPayload<=pPage->maxLocal) ){ 918 /* This is the (easy) common case where the entire payload fits 919 ** on the local page. No overflow is required. 920 */ 921 if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4; 922 pInfo->nLocal = (u16)nPayload; 923 pInfo->iOverflow = 0; 924 }else{ 925 /* If the payload will not fit completely on the local page, we have 926 ** to decide how much to store locally and how much to spill onto 927 ** overflow pages. The strategy is to minimize the amount of unused 928 ** space on overflow pages while keeping the amount of local storage 929 ** in between minLocal and maxLocal. 930 ** 931 ** Warning: changing the way overflow payload is distributed in any 932 ** way will result in an incompatible file format. 933 */ 934 int minLocal; /* Minimum amount of payload held locally */ 935 int maxLocal; /* Maximum amount of payload held locally */ 936 int surplus; /* Overflow payload available for local storage */ 937 938 minLocal = pPage->minLocal; 939 maxLocal = pPage->maxLocal; 940 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); 941 testcase( surplus==maxLocal ); 942 testcase( surplus==maxLocal+1 ); 943 if( surplus <= maxLocal ){ 944 pInfo->nLocal = (u16)surplus; 945 }else{ 946 pInfo->nLocal = (u16)minLocal; 947 } 948 pInfo->iOverflow = (u16)(pInfo->nLocal + n); 949 pInfo->nSize = pInfo->iOverflow + 4; 950 } 951} 952#define parseCell(pPage, iCell, pInfo) \ 953 btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) 954static void btreeParseCell( 955 MemPage *pPage, /* Page containing the cell */ 956 int iCell, /* The cell index. First cell is 0 */ 957 CellInfo *pInfo /* Fill in this structure */ 958){ 959 parseCell(pPage, iCell, pInfo); 960} 961 962/* 963** Compute the total number of bytes that a Cell needs in the cell 964** data area of the btree-page. The return number includes the cell 965** data header and the local payload, but not any overflow page or 966** the space used by the cell pointer. 967*/ 968static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 969 u8 *pIter = &pCell[pPage->childPtrSize]; 970 u32 nSize; 971 972#ifdef SQLITE_DEBUG 973 /* The value returned by this function should always be the same as 974 ** the (CellInfo.nSize) value found by doing a full parse of the 975 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 976 ** this function verifies that this invariant is not violated. */ 977 CellInfo debuginfo; 978 btreeParseCellPtr(pPage, pCell, &debuginfo); 979#endif 980 981 if( pPage->intKey ){ 982 u8 *pEnd; 983 if( pPage->hasData ){ 984 pIter += getVarint32(pIter, nSize); 985 }else{ 986 nSize = 0; 987 } 988 989 /* pIter now points at the 64-bit integer key value, a variable length 990 ** integer. The following block moves pIter to point at the first byte 991 ** past the end of the key value. */ 992 pEnd = &pIter[9]; 993 while( (*pIter++)&0x80 && pIter<pEnd ); 994 }else{ 995 pIter += getVarint32(pIter, nSize); 996 } 997 998 testcase( nSize==pPage->maxLocal ); 999 testcase( nSize==pPage->maxLocal+1 ); 1000 if( nSize>pPage->maxLocal ){ 1001 int minLocal = pPage->minLocal; 1002 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 1003 testcase( nSize==pPage->maxLocal ); 1004 testcase( nSize==pPage->maxLocal+1 ); 1005 if( nSize>pPage->maxLocal ){ 1006 nSize = minLocal; 1007 } 1008 nSize += 4; 1009 } 1010 nSize += (u32)(pIter - pCell); 1011 1012 /* The minimum size of any cell is 4 bytes. */ 1013 if( nSize<4 ){ 1014 nSize = 4; 1015 } 1016 1017 assert( nSize==debuginfo.nSize ); 1018 return (u16)nSize; 1019} 1020 1021#ifdef SQLITE_DEBUG 1022/* This variation on cellSizePtr() is used inside of assert() statements 1023** only. */ 1024static u16 cellSize(MemPage *pPage, int iCell){ 1025 return cellSizePtr(pPage, findCell(pPage, iCell)); 1026} 1027#endif 1028 1029#ifndef SQLITE_OMIT_AUTOVACUUM 1030/* 1031** If the cell pCell, part of page pPage contains a pointer 1032** to an overflow page, insert an entry into the pointer-map 1033** for the overflow page. 1034*/ 1035static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){ 1036 CellInfo info; 1037 if( *pRC ) return; 1038 assert( pCell!=0 ); 1039 btreeParseCellPtr(pPage, pCell, &info); 1040 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); 1041 if( info.iOverflow ){ 1042 Pgno ovfl = get4byte(&pCell[info.iOverflow]); 1043 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); 1044 } 1045} 1046#endif 1047 1048 1049/* 1050** Defragment the page given. All Cells are moved to the 1051** end of the page and all free space is collected into one 1052** big FreeBlk that occurs in between the header and cell 1053** pointer array and the cell content area. 1054*/ 1055static int defragmentPage(MemPage *pPage){ 1056 int i; /* Loop counter */ 1057 int pc; /* Address of a i-th cell */ 1058 int hdr; /* Offset to the page header */ 1059 int size; /* Size of a cell */ 1060 int usableSize; /* Number of usable bytes on a page */ 1061 int cellOffset; /* Offset to the cell pointer array */ 1062 int cbrk; /* Offset to the cell content area */ 1063 int nCell; /* Number of cells on the page */ 1064 unsigned char *data; /* The page data */ 1065 unsigned char *temp; /* Temp area for cell content */ 1066 int iCellFirst; /* First allowable cell index */ 1067 int iCellLast; /* Last possible cell index */ 1068 1069 1070 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1071 assert( pPage->pBt!=0 ); 1072 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 1073 assert( pPage->nOverflow==0 ); 1074 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1075 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 1076 data = pPage->aData; 1077 hdr = pPage->hdrOffset; 1078 cellOffset = pPage->cellOffset; 1079 nCell = pPage->nCell; 1080 assert( nCell==get2byte(&data[hdr+3]) ); 1081 usableSize = pPage->pBt->usableSize; 1082 cbrk = get2byte(&data[hdr+5]); 1083 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); 1084 cbrk = usableSize; 1085 iCellFirst = cellOffset + 2*nCell; 1086 iCellLast = usableSize - 4; 1087 for(i=0; i<nCell; i++){ 1088 u8 *pAddr; /* The i-th cell pointer */ 1089 pAddr = &data[cellOffset + i*2]; 1090 pc = get2byte(pAddr); 1091 testcase( pc==iCellFirst ); 1092 testcase( pc==iCellLast ); 1093#if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) 1094 /* These conditions have already been verified in btreeInitPage() 1095 ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined 1096 */ 1097 if( pc<iCellFirst || pc>iCellLast ){ 1098 return SQLITE_CORRUPT_BKPT; 1099 } 1100#endif 1101 assert( pc>=iCellFirst && pc<=iCellLast ); 1102 size = cellSizePtr(pPage, &temp[pc]); 1103 cbrk -= size; 1104#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) 1105 if( cbrk<iCellFirst ){ 1106 return SQLITE_CORRUPT_BKPT; 1107 } 1108#else 1109 if( cbrk<iCellFirst || pc+size>usableSize ){ 1110 return SQLITE_CORRUPT_BKPT; 1111 } 1112#endif 1113 assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); 1114 testcase( cbrk+size==usableSize ); 1115 testcase( pc+size==usableSize ); 1116 memcpy(&data[cbrk], &temp[pc], size); 1117 put2byte(pAddr, cbrk); 1118 } 1119 assert( cbrk>=iCellFirst ); 1120 put2byte(&data[hdr+5], cbrk); 1121 data[hdr+1] = 0; 1122 data[hdr+2] = 0; 1123 data[hdr+7] = 0; 1124 memset(&data[iCellFirst], 0, cbrk-iCellFirst); 1125 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1126 if( cbrk-iCellFirst!=pPage->nFree ){ 1127 return SQLITE_CORRUPT_BKPT; 1128 } 1129 return SQLITE_OK; 1130} 1131 1132/* 1133** Allocate nByte bytes of space from within the B-Tree page passed 1134** as the first argument. Write into *pIdx the index into pPage->aData[] 1135** of the first byte of allocated space. Return either SQLITE_OK or 1136** an error code (usually SQLITE_CORRUPT). 1137** 1138** The caller guarantees that there is sufficient space to make the 1139** allocation. This routine might need to defragment in order to bring 1140** all the space together, however. This routine will avoid using 1141** the first two bytes past the cell pointer area since presumably this 1142** allocation is being made in order to insert a new cell, so we will 1143** also end up needing a new cell pointer. 1144*/ 1145static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ 1146 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ 1147 u8 * const data = pPage->aData; /* Local cache of pPage->aData */ 1148 int nFrag; /* Number of fragmented bytes on pPage */ 1149 int top; /* First byte of cell content area */ 1150 int gap; /* First byte of gap between cell pointers and cell content */ 1151 int rc; /* Integer return code */ 1152 int usableSize; /* Usable size of the page */ 1153 1154 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1155 assert( pPage->pBt ); 1156 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1157 assert( nByte>=0 ); /* Minimum cell size is 4 */ 1158 assert( pPage->nFree>=nByte ); 1159 assert( pPage->nOverflow==0 ); 1160 usableSize = pPage->pBt->usableSize; 1161 assert( nByte < usableSize-8 ); 1162 1163 nFrag = data[hdr+7]; 1164 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); 1165 gap = pPage->cellOffset + 2*pPage->nCell; 1166 top = get2byteNotZero(&data[hdr+5]); 1167 if( gap>top ) return SQLITE_CORRUPT_BKPT; 1168 testcase( gap+2==top ); 1169 testcase( gap+1==top ); 1170 testcase( gap==top ); 1171 1172 if( nFrag>=60 ){ 1173 /* Always defragment highly fragmented pages */ 1174 rc = defragmentPage(pPage); 1175 if( rc ) return rc; 1176 top = get2byteNotZero(&data[hdr+5]); 1177 }else if( gap+2<=top ){ 1178 /* Search the freelist looking for a free slot big enough to satisfy 1179 ** the request. The allocation is made from the first free slot in 1180 ** the list that is large enough to accomadate it. 1181 */ 1182 int pc, addr; 1183 for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){ 1184 int size; /* Size of the free slot */ 1185 if( pc>usableSize-4 || pc<addr+4 ){ 1186 return SQLITE_CORRUPT_BKPT; 1187 } 1188 size = get2byte(&data[pc+2]); 1189 if( size>=nByte ){ 1190 int x = size - nByte; 1191 testcase( x==4 ); 1192 testcase( x==3 ); 1193 if( x<4 ){ 1194 /* Remove the slot from the free-list. Update the number of 1195 ** fragmented bytes within the page. */ 1196 memcpy(&data[addr], &data[pc], 2); 1197 data[hdr+7] = (u8)(nFrag + x); 1198 }else if( size+pc > usableSize ){ 1199 return SQLITE_CORRUPT_BKPT; 1200 }else{ 1201 /* The slot remains on the free-list. Reduce its size to account 1202 ** for the portion used by the new allocation. */ 1203 put2byte(&data[pc+2], x); 1204 } 1205 *pIdx = pc + x; 1206 return SQLITE_OK; 1207 } 1208 } 1209 } 1210 1211 /* Check to make sure there is enough space in the gap to satisfy 1212 ** the allocation. If not, defragment. 1213 */ 1214 testcase( gap+2+nByte==top ); 1215 if( gap+2+nByte>top ){ 1216 rc = defragmentPage(pPage); 1217 if( rc ) return rc; 1218 top = get2byteNotZero(&data[hdr+5]); 1219 assert( gap+nByte<=top ); 1220 } 1221 1222 1223 /* Allocate memory from the gap in between the cell pointer array 1224 ** and the cell content area. The btreeInitPage() call has already 1225 ** validated the freelist. Given that the freelist is valid, there 1226 ** is no way that the allocation can extend off the end of the page. 1227 ** The assert() below verifies the previous sentence. 1228 */ 1229 top -= nByte; 1230 put2byte(&data[hdr+5], top); 1231 assert( top+nByte <= (int)pPage->pBt->usableSize ); 1232 *pIdx = top; 1233 return SQLITE_OK; 1234} 1235 1236/* 1237** Return a section of the pPage->aData to the freelist. 1238** The first byte of the new free block is pPage->aDisk[start] 1239** and the size of the block is "size" bytes. 1240** 1241** Most of the effort here is involved in coalesing adjacent 1242** free blocks into a single big free block. 1243*/ 1244static int freeSpace(MemPage *pPage, int start, int size){ 1245 int addr, pbegin, hdr; 1246 int iLast; /* Largest possible freeblock offset */ 1247 unsigned char *data = pPage->aData; 1248 1249 assert( pPage->pBt!=0 ); 1250 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1251 assert( start>=pPage->hdrOffset+6+pPage->childPtrSize ); 1252 assert( (start + size) <= (int)pPage->pBt->usableSize ); 1253 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1254 assert( size>=0 ); /* Minimum cell size is 4 */ 1255 1256 if( pPage->pBt->secureDelete ){ 1257 /* Overwrite deleted information with zeros when the secure_delete 1258 ** option is enabled */ 1259 memset(&data[start], 0, size); 1260 } 1261 1262 /* Add the space back into the linked list of freeblocks. Note that 1263 ** even though the freeblock list was checked by btreeInitPage(), 1264 ** btreeInitPage() did not detect overlapping cells or 1265 ** freeblocks that overlapped cells. Nor does it detect when the 1266 ** cell content area exceeds the value in the page header. If these 1267 ** situations arise, then subsequent insert operations might corrupt 1268 ** the freelist. So we do need to check for corruption while scanning 1269 ** the freelist. 1270 */ 1271 hdr = pPage->hdrOffset; 1272 addr = hdr + 1; 1273 iLast = pPage->pBt->usableSize - 4; 1274 assert( start<=iLast ); 1275 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ 1276 if( pbegin<addr+4 ){ 1277 return SQLITE_CORRUPT_BKPT; 1278 } 1279 addr = pbegin; 1280 } 1281 if( pbegin>iLast ){ 1282 return SQLITE_CORRUPT_BKPT; 1283 } 1284 assert( pbegin>addr || pbegin==0 ); 1285 put2byte(&data[addr], start); 1286 put2byte(&data[start], pbegin); 1287 put2byte(&data[start+2], size); 1288 pPage->nFree = pPage->nFree + (u16)size; 1289 1290 /* Coalesce adjacent free blocks */ 1291 addr = hdr + 1; 1292 while( (pbegin = get2byte(&data[addr]))>0 ){ 1293 int pnext, psize, x; 1294 assert( pbegin>addr ); 1295 assert( pbegin <= (int)pPage->pBt->usableSize-4 ); 1296 pnext = get2byte(&data[pbegin]); 1297 psize = get2byte(&data[pbegin+2]); 1298 if( pbegin + psize + 3 >= pnext && pnext>0 ){ 1299 int frag = pnext - (pbegin+psize); 1300 if( (frag<0) || (frag>(int)data[hdr+7]) ){ 1301 return SQLITE_CORRUPT_BKPT; 1302 } 1303 data[hdr+7] -= (u8)frag; 1304 x = get2byte(&data[pnext]); 1305 put2byte(&data[pbegin], x); 1306 x = pnext + get2byte(&data[pnext+2]) - pbegin; 1307 put2byte(&data[pbegin+2], x); 1308 }else{ 1309 addr = pbegin; 1310 } 1311 } 1312 1313 /* If the cell content area begins with a freeblock, remove it. */ 1314 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){ 1315 int top; 1316 pbegin = get2byte(&data[hdr+1]); 1317 memcpy(&data[hdr+1], &data[pbegin], 2); 1318 top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]); 1319 put2byte(&data[hdr+5], top); 1320 } 1321 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1322 return SQLITE_OK; 1323} 1324 1325/* 1326** Decode the flags byte (the first byte of the header) for a page 1327** and initialize fields of the MemPage structure accordingly. 1328** 1329** Only the following combinations are supported. Anything different 1330** indicates a corrupt database files: 1331** 1332** PTF_ZERODATA 1333** PTF_ZERODATA | PTF_LEAF 1334** PTF_LEAFDATA | PTF_INTKEY 1335** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF 1336*/ 1337static int decodeFlags(MemPage *pPage, int flagByte){ 1338 BtShared *pBt; /* A copy of pPage->pBt */ 1339 1340 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); 1341 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1342 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 ); 1343 flagByte &= ~PTF_LEAF; 1344 pPage->childPtrSize = 4-4*pPage->leaf; 1345 pBt = pPage->pBt; 1346 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ 1347 pPage->intKey = 1; 1348 pPage->hasData = pPage->leaf; 1349 pPage->maxLocal = pBt->maxLeaf; 1350 pPage->minLocal = pBt->minLeaf; 1351 }else if( flagByte==PTF_ZERODATA ){ 1352 pPage->intKey = 0; 1353 pPage->hasData = 0; 1354 pPage->maxLocal = pBt->maxLocal; 1355 pPage->minLocal = pBt->minLocal; 1356 }else{ 1357 return SQLITE_CORRUPT_BKPT; 1358 } 1359 return SQLITE_OK; 1360} 1361 1362/* 1363** Initialize the auxiliary information for a disk block. 1364** 1365** Return SQLITE_OK on success. If we see that the page does 1366** not contain a well-formed database page, then return 1367** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 1368** guarantee that the page is well-formed. It only shows that 1369** we failed to detect any corruption. 1370*/ 1371static int btreeInitPage(MemPage *pPage){ 1372 1373 assert( pPage->pBt!=0 ); 1374 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1375 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 1376 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 1377 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 1378 1379 if( !pPage->isInit ){ 1380 u16 pc; /* Address of a freeblock within pPage->aData[] */ 1381 u8 hdr; /* Offset to beginning of page header */ 1382 u8 *data; /* Equal to pPage->aData */ 1383 BtShared *pBt; /* The main btree structure */ 1384 int usableSize; /* Amount of usable space on each page */ 1385 u16 cellOffset; /* Offset from start of page to first cell pointer */ 1386 int nFree; /* Number of unused bytes on the page */ 1387 int top; /* First byte of the cell content area */ 1388 int iCellFirst; /* First allowable cell or freeblock offset */ 1389 int iCellLast; /* Last possible cell or freeblock offset */ 1390 1391 pBt = pPage->pBt; 1392 1393 hdr = pPage->hdrOffset; 1394 data = pPage->aData; 1395 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; 1396 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 1397 pPage->maskPage = (u16)(pBt->pageSize - 1); 1398 pPage->nOverflow = 0; 1399 usableSize = pBt->usableSize; 1400 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; 1401 top = get2byteNotZero(&data[hdr+5]); 1402 pPage->nCell = get2byte(&data[hdr+3]); 1403 if( pPage->nCell>MX_CELL(pBt) ){ 1404 /* To many cells for a single page. The page must be corrupt */ 1405 return SQLITE_CORRUPT_BKPT; 1406 } 1407 testcase( pPage->nCell==MX_CELL(pBt) ); 1408 1409 /* A malformed database page might cause us to read past the end 1410 ** of page when parsing a cell. 1411 ** 1412 ** The following block of code checks early to see if a cell extends 1413 ** past the end of a page boundary and causes SQLITE_CORRUPT to be 1414 ** returned if it does. 1415 */ 1416 iCellFirst = cellOffset + 2*pPage->nCell; 1417 iCellLast = usableSize - 4; 1418#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) 1419 { 1420 int i; /* Index into the cell pointer array */ 1421 int sz; /* Size of a cell */ 1422 1423 if( !pPage->leaf ) iCellLast--; 1424 for(i=0; i<pPage->nCell; i++){ 1425 pc = get2byte(&data[cellOffset+i*2]); 1426 testcase( pc==iCellFirst ); 1427 testcase( pc==iCellLast ); 1428 if( pc<iCellFirst || pc>iCellLast ){ 1429 return SQLITE_CORRUPT_BKPT; 1430 } 1431 sz = cellSizePtr(pPage, &data[pc]); 1432 testcase( pc+sz==usableSize ); 1433 if( pc+sz>usableSize ){ 1434 return SQLITE_CORRUPT_BKPT; 1435 } 1436 } 1437 if( !pPage->leaf ) iCellLast++; 1438 } 1439#endif 1440 1441 /* Compute the total free space on the page */ 1442 pc = get2byte(&data[hdr+1]); 1443 nFree = data[hdr+7] + top; 1444 while( pc>0 ){ 1445 u16 next, size; 1446 if( pc<iCellFirst || pc>iCellLast ){ 1447 /* Start of free block is off the page */ 1448 return SQLITE_CORRUPT_BKPT; 1449 } 1450 next = get2byte(&data[pc]); 1451 size = get2byte(&data[pc+2]); 1452 if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){ 1453 /* Free blocks must be in ascending order. And the last byte of 1454 ** the free-block must lie on the database page. */ 1455 return SQLITE_CORRUPT_BKPT; 1456 } 1457 nFree = nFree + size; 1458 pc = next; 1459 } 1460 1461 /* At this point, nFree contains the sum of the offset to the start 1462 ** of the cell-content area plus the number of free bytes within 1463 ** the cell-content area. If this is greater than the usable-size 1464 ** of the page, then the page must be corrupted. This check also 1465 ** serves to verify that the offset to the start of the cell-content 1466 ** area, according to the page header, lies within the page. 1467 */ 1468 if( nFree>usableSize ){ 1469 return SQLITE_CORRUPT_BKPT; 1470 } 1471 pPage->nFree = (u16)(nFree - iCellFirst); 1472 pPage->isInit = 1; 1473 } 1474 return SQLITE_OK; 1475} 1476 1477/* 1478** Set up a raw page so that it looks like a database page holding 1479** no entries. 1480*/ 1481static void zeroPage(MemPage *pPage, int flags){ 1482 unsigned char *data = pPage->aData; 1483 BtShared *pBt = pPage->pBt; 1484 u8 hdr = pPage->hdrOffset; 1485 u16 first; 1486 1487 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); 1488 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1489 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 1490 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 1491 assert( sqlite3_mutex_held(pBt->mutex) ); 1492 if( pBt->secureDelete ){ 1493 memset(&data[hdr], 0, pBt->usableSize - hdr); 1494 } 1495 data[hdr] = (char)flags; 1496 first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0); 1497 memset(&data[hdr+1], 0, 4); 1498 data[hdr+7] = 0; 1499 put2byte(&data[hdr+5], pBt->usableSize); 1500 pPage->nFree = (u16)(pBt->usableSize - first); 1501 decodeFlags(pPage, flags); 1502 pPage->hdrOffset = hdr; 1503 pPage->cellOffset = first; 1504 pPage->nOverflow = 0; 1505 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 1506 pPage->maskPage = (u16)(pBt->pageSize - 1); 1507 pPage->nCell = 0; 1508 pPage->isInit = 1; 1509} 1510 1511 1512/* 1513** Convert a DbPage obtained from the pager into a MemPage used by 1514** the btree layer. 1515*/ 1516static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ 1517 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 1518 pPage->aData = sqlite3PagerGetData(pDbPage); 1519 pPage->pDbPage = pDbPage; 1520 pPage->pBt = pBt; 1521 pPage->pgno = pgno; 1522 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; 1523 return pPage; 1524} 1525 1526/* 1527** Get a page from the pager. Initialize the MemPage.pBt and 1528** MemPage.aData elements if needed. 1529** 1530** If the noContent flag is set, it means that we do not care about 1531** the content of the page at this time. So do not go to the disk 1532** to fetch the content. Just fill in the content with zeros for now. 1533** If in the future we call sqlite3PagerWrite() on this page, that 1534** means we have started to be concerned about content and the disk 1535** read should occur at that point. 1536*/ 1537static int btreeGetPage( 1538 BtShared *pBt, /* The btree */ 1539 Pgno pgno, /* Number of the page to fetch */ 1540 MemPage **ppPage, /* Return the page in this parameter */ 1541 int noContent /* Do not load page content if true */ 1542){ 1543 int rc; 1544 DbPage *pDbPage; 1545 1546 assert( sqlite3_mutex_held(pBt->mutex) ); 1547 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); 1548 if( rc ) return rc; 1549 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); 1550 return SQLITE_OK; 1551} 1552 1553/* 1554** Retrieve a page from the pager cache. If the requested page is not 1555** already in the pager cache return NULL. Initialize the MemPage.pBt and 1556** MemPage.aData elements if needed. 1557*/ 1558static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ 1559 DbPage *pDbPage; 1560 assert( sqlite3_mutex_held(pBt->mutex) ); 1561 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 1562 if( pDbPage ){ 1563 return btreePageFromDbPage(pDbPage, pgno, pBt); 1564 } 1565 return 0; 1566} 1567 1568/* 1569** Return the size of the database file in pages. If there is any kind of 1570** error, return ((unsigned int)-1). 1571*/ 1572static Pgno btreePagecount(BtShared *pBt){ 1573 return pBt->nPage; 1574} 1575u32 sqlite3BtreeLastPage(Btree *p){ 1576 assert( sqlite3BtreeHoldsMutex(p) ); 1577 assert( ((p->pBt->nPage)&0x8000000)==0 ); 1578 return (int)btreePagecount(p->pBt); 1579} 1580 1581/* 1582** Get a page from the pager and initialize it. This routine is just a 1583** convenience wrapper around separate calls to btreeGetPage() and 1584** btreeInitPage(). 1585** 1586** If an error occurs, then the value *ppPage is set to is undefined. It 1587** may remain unchanged, or it may be set to an invalid value. 1588*/ 1589static int getAndInitPage( 1590 BtShared *pBt, /* The database file */ 1591 Pgno pgno, /* Number of the page to get */ 1592 MemPage **ppPage /* Write the page pointer here */ 1593){ 1594 int rc; 1595 assert( sqlite3_mutex_held(pBt->mutex) ); 1596 1597 if( pgno>btreePagecount(pBt) ){ 1598 rc = SQLITE_CORRUPT_BKPT; 1599 }else{ 1600 rc = btreeGetPage(pBt, pgno, ppPage, 0); 1601 if( rc==SQLITE_OK ){ 1602 rc = btreeInitPage(*ppPage); 1603 if( rc!=SQLITE_OK ){ 1604 releasePage(*ppPage); 1605 } 1606 } 1607 } 1608 1609 testcase( pgno==0 ); 1610 assert( pgno!=0 || rc==SQLITE_CORRUPT ); 1611 return rc; 1612} 1613 1614/* 1615** Release a MemPage. This should be called once for each prior 1616** call to btreeGetPage. 1617*/ 1618static void releasePage(MemPage *pPage){ 1619 if( pPage ){ 1620 assert( pPage->aData ); 1621 assert( pPage->pBt ); 1622 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 1623 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 1624 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1625 sqlite3PagerUnref(pPage->pDbPage); 1626 } 1627} 1628 1629/* 1630** During a rollback, when the pager reloads information into the cache 1631** so that the cache is restored to its original state at the start of 1632** the transaction, for each page restored this routine is called. 1633** 1634** This routine needs to reset the extra data section at the end of the 1635** page to agree with the restored data. 1636*/ 1637static void pageReinit(DbPage *pData){ 1638 MemPage *pPage; 1639 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 1640 assert( sqlite3PagerPageRefcount(pData)>0 ); 1641 if( pPage->isInit ){ 1642 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 1643 pPage->isInit = 0; 1644 if( sqlite3PagerPageRefcount(pData)>1 ){ 1645 /* pPage might not be a btree page; it might be an overflow page 1646 ** or ptrmap page or a free page. In those cases, the following 1647 ** call to btreeInitPage() will likely return SQLITE_CORRUPT. 1648 ** But no harm is done by this. And it is very important that 1649 ** btreeInitPage() be called on every btree page so we make 1650 ** the call for every page that comes in for re-initing. */ 1651 btreeInitPage(pPage); 1652 } 1653 } 1654} 1655 1656/* 1657** Invoke the busy handler for a btree. 1658*/ 1659static int btreeInvokeBusyHandler(void *pArg){ 1660 BtShared *pBt = (BtShared*)pArg; 1661 assert( pBt->db ); 1662 assert( sqlite3_mutex_held(pBt->db->mutex) ); 1663 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 1664} 1665 1666/* 1667** Open a database file. 1668** 1669** zFilename is the name of the database file. If zFilename is NULL 1670** then an ephemeral database is created. The ephemeral database might 1671** be exclusively in memory, or it might use a disk-based memory cache. 1672** Either way, the ephemeral database will be automatically deleted 1673** when sqlite3BtreeClose() is called. 1674** 1675** If zFilename is ":memory:" then an in-memory database is created 1676** that is automatically destroyed when it is closed. 1677** 1678** The "flags" parameter is a bitmask that might contain bits 1679** BTREE_OMIT_JOURNAL and/or BTREE_NO_READLOCK. The BTREE_NO_READLOCK 1680** bit is also set if the SQLITE_NoReadlock flags is set in db->flags. 1681** These flags are passed through into sqlite3PagerOpen() and must 1682** be the same values as PAGER_OMIT_JOURNAL and PAGER_NO_READLOCK. 1683** 1684** If the database is already opened in the same database connection 1685** and we are in shared cache mode, then the open will fail with an 1686** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared 1687** objects in the same database connection since doing so will lead 1688** to problems with locking. 1689*/ 1690int sqlite3BtreeOpen( 1691 const char *zFilename, /* Name of the file containing the BTree database */ 1692 sqlite3 *db, /* Associated database handle */ 1693 Btree **ppBtree, /* Pointer to new Btree object written here */ 1694 int flags, /* Options */ 1695 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 1696){ 1697 sqlite3_vfs *pVfs; /* The VFS to use for this btree */ 1698 BtShared *pBt = 0; /* Shared part of btree structure */ 1699 Btree *p; /* Handle to return */ 1700 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */ 1701 int rc = SQLITE_OK; /* Result code from this function */ 1702 u8 nReserve; /* Byte of unused space on each page */ 1703 unsigned char zDbHeader[100]; /* Database header content */ 1704 1705 /* True if opening an ephemeral, temporary database */ 1706 const int isTempDb = zFilename==0 || zFilename[0]==0; 1707 1708 /* Set the variable isMemdb to true for an in-memory database, or 1709 ** false for a file-based database. 1710 */ 1711#ifdef SQLITE_OMIT_MEMORYDB 1712 const int isMemdb = 0; 1713#else 1714 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0) 1715 || (isTempDb && sqlite3TempInMemory(db)); 1716#endif 1717 1718 assert( db!=0 ); 1719 assert( sqlite3_mutex_held(db->mutex) ); 1720 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */ 1721 1722 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */ 1723 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 ); 1724 1725 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */ 1726 assert( (flags & BTREE_SINGLE)==0 || isTempDb ); 1727 1728 if( db->flags & SQLITE_NoReadlock ){ 1729 flags |= BTREE_NO_READLOCK; 1730 } 1731 if( isMemdb ){ 1732 flags |= BTREE_MEMORY; 1733 } 1734 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){ 1735 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB; 1736 } 1737 pVfs = db->pVfs; 1738 p = sqlite3MallocZero(sizeof(Btree)); 1739 if( !p ){ 1740 return SQLITE_NOMEM; 1741 } 1742 p->inTrans = TRANS_NONE; 1743 p->db = db; 1744#ifndef SQLITE_OMIT_SHARED_CACHE 1745 p->lock.pBtree = p; 1746 p->lock.iTable = 1; 1747#endif 1748 1749#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1750 /* 1751 ** If this Btree is a candidate for shared cache, try to find an 1752 ** existing BtShared object that we can share with 1753 */ 1754 if( isMemdb==0 && isTempDb==0 ){ 1755 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ 1756 int nFullPathname = pVfs->mxPathname+1; 1757 char *zFullPathname = sqlite3Malloc(nFullPathname); 1758 sqlite3_mutex *mutexShared; 1759 p->sharable = 1; 1760 if( !zFullPathname ){ 1761 sqlite3_free(p); 1762 return SQLITE_NOMEM; 1763 } 1764 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); 1765 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); 1766 sqlite3_mutex_enter(mutexOpen); 1767 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1768 sqlite3_mutex_enter(mutexShared); 1769 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ 1770 assert( pBt->nRef>0 ); 1771 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) 1772 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 1773 int iDb; 1774 for(iDb=db->nDb-1; iDb>=0; iDb--){ 1775 Btree *pExisting = db->aDb[iDb].pBt; 1776 if( pExisting && pExisting->pBt==pBt ){ 1777 sqlite3_mutex_leave(mutexShared); 1778 sqlite3_mutex_leave(mutexOpen); 1779 sqlite3_free(zFullPathname); 1780 sqlite3_free(p); 1781 return SQLITE_CONSTRAINT; 1782 } 1783 } 1784 p->pBt = pBt; 1785 pBt->nRef++; 1786 break; 1787 } 1788 } 1789 sqlite3_mutex_leave(mutexShared); 1790 sqlite3_free(zFullPathname); 1791 } 1792#ifdef SQLITE_DEBUG 1793 else{ 1794 /* In debug mode, we mark all persistent databases as sharable 1795 ** even when they are not. This exercises the locking code and 1796 ** gives more opportunity for asserts(sqlite3_mutex_held()) 1797 ** statements to find locking problems. 1798 */ 1799 p->sharable = 1; 1800 } 1801#endif 1802 } 1803#endif 1804 if( pBt==0 ){ 1805 /* 1806 ** The following asserts make sure that structures used by the btree are 1807 ** the right size. This is to guard against size changes that result 1808 ** when compiling on a different architecture. 1809 */ 1810 assert( sizeof(i64)==8 || sizeof(i64)==4 ); 1811 assert( sizeof(u64)==8 || sizeof(u64)==4 ); 1812 assert( sizeof(u32)==4 ); 1813 assert( sizeof(u16)==2 ); 1814 assert( sizeof(Pgno)==4 ); 1815 1816 pBt = sqlite3MallocZero( sizeof(*pBt) ); 1817 if( pBt==0 ){ 1818 rc = SQLITE_NOMEM; 1819 goto btree_open_out; 1820 } 1821 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 1822 EXTRA_SIZE, flags, vfsFlags, pageReinit); 1823 if( rc==SQLITE_OK ){ 1824 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 1825 } 1826 if( rc!=SQLITE_OK ){ 1827 goto btree_open_out; 1828 } 1829 pBt->openFlags = (u8)flags; 1830 pBt->db = db; 1831 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt); 1832 p->pBt = pBt; 1833 1834 pBt->pCursor = 0; 1835 pBt->pPage1 = 0; 1836 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager); 1837#ifdef SQLITE_SECURE_DELETE 1838 pBt->secureDelete = 1; 1839#endif 1840 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); 1841 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 1842 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 1843 pBt->pageSize = 0; 1844#ifndef SQLITE_OMIT_AUTOVACUUM 1845 /* If the magic name ":memory:" will create an in-memory database, then 1846 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 1847 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 1848 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 1849 ** regular file-name. In this case the auto-vacuum applies as per normal. 1850 */ 1851 if( zFilename && !isMemdb ){ 1852 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 1853 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 1854 } 1855#endif 1856 nReserve = 0; 1857 }else{ 1858 nReserve = zDbHeader[20]; 1859 pBt->pageSizeFixed = 1; 1860#ifndef SQLITE_OMIT_AUTOVACUUM 1861 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 1862 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 1863#endif 1864 } 1865 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 1866 if( rc ) goto btree_open_out; 1867 pBt->usableSize = pBt->pageSize - nReserve; 1868 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 1869 1870#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1871 /* Add the new BtShared object to the linked list sharable BtShareds. 1872 */ 1873 if( p->sharable ){ 1874 sqlite3_mutex *mutexShared; 1875 pBt->nRef = 1; 1876 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1877 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ 1878 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); 1879 if( pBt->mutex==0 ){ 1880 rc = SQLITE_NOMEM; 1881 db->mallocFailed = 0; 1882 goto btree_open_out; 1883 } 1884 } 1885 sqlite3_mutex_enter(mutexShared); 1886 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); 1887 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; 1888 sqlite3_mutex_leave(mutexShared); 1889 } 1890#endif 1891 } 1892 1893#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 1894 /* If the new Btree uses a sharable pBtShared, then link the new 1895 ** Btree into the list of all sharable Btrees for the same connection. 1896 ** The list is kept in ascending order by pBt address. 1897 */ 1898 if( p->sharable ){ 1899 int i; 1900 Btree *pSib; 1901 for(i=0; i<db->nDb; i++){ 1902 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 1903 while( pSib->pPrev ){ pSib = pSib->pPrev; } 1904 if( p->pBt<pSib->pBt ){ 1905 p->pNext = pSib; 1906 p->pPrev = 0; 1907 pSib->pPrev = p; 1908 }else{ 1909 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){ 1910 pSib = pSib->pNext; 1911 } 1912 p->pNext = pSib->pNext; 1913 p->pPrev = pSib; 1914 if( p->pNext ){ 1915 p->pNext->pPrev = p; 1916 } 1917 pSib->pNext = p; 1918 } 1919 break; 1920 } 1921 } 1922 } 1923#endif 1924 *ppBtree = p; 1925 1926btree_open_out: 1927 if( rc!=SQLITE_OK ){ 1928 if( pBt && pBt->pPager ){ 1929 sqlite3PagerClose(pBt->pPager); 1930 } 1931 sqlite3_free(pBt); 1932 sqlite3_free(p); 1933 *ppBtree = 0; 1934 }else{ 1935 /* If the B-Tree was successfully opened, set the pager-cache size to the 1936 ** default value. Except, when opening on an existing shared pager-cache, 1937 ** do not change the pager-cache size. 1938 */ 1939 if( sqlite3BtreeSchema(p, 0, 0)==0 ){ 1940 sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE); 1941 } 1942 } 1943 if( mutexOpen ){ 1944 assert( sqlite3_mutex_held(mutexOpen) ); 1945 sqlite3_mutex_leave(mutexOpen); 1946 } 1947 return rc; 1948} 1949 1950/* 1951** Decrement the BtShared.nRef counter. When it reaches zero, 1952** remove the BtShared structure from the sharing list. Return 1953** true if the BtShared.nRef counter reaches zero and return 1954** false if it is still positive. 1955*/ 1956static int removeFromSharingList(BtShared *pBt){ 1957#ifndef SQLITE_OMIT_SHARED_CACHE 1958 sqlite3_mutex *pMaster; 1959 BtShared *pList; 1960 int removed = 0; 1961 1962 assert( sqlite3_mutex_notheld(pBt->mutex) ); 1963 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); 1964 sqlite3_mutex_enter(pMaster); 1965 pBt->nRef--; 1966 if( pBt->nRef<=0 ){ 1967 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ 1968 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; 1969 }else{ 1970 pList = GLOBAL(BtShared*,sqlite3SharedCacheList); 1971 while( ALWAYS(pList) && pList->pNext!=pBt ){ 1972 pList=pList->pNext; 1973 } 1974 if( ALWAYS(pList) ){ 1975 pList->pNext = pBt->pNext; 1976 } 1977 } 1978 if( SQLITE_THREADSAFE ){ 1979 sqlite3_mutex_free(pBt->mutex); 1980 } 1981 removed = 1; 1982 } 1983 sqlite3_mutex_leave(pMaster); 1984 return removed; 1985#else 1986 return 1; 1987#endif 1988} 1989 1990/* 1991** Make sure pBt->pTmpSpace points to an allocation of 1992** MX_CELL_SIZE(pBt) bytes. 1993*/ 1994static void allocateTempSpace(BtShared *pBt){ 1995 if( !pBt->pTmpSpace ){ 1996 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 1997 } 1998} 1999 2000/* 2001** Free the pBt->pTmpSpace allocation 2002*/ 2003static void freeTempSpace(BtShared *pBt){ 2004 sqlite3PageFree( pBt->pTmpSpace); 2005 pBt->pTmpSpace = 0; 2006} 2007 2008/* 2009** Close an open database and invalidate all cursors. 2010*/ 2011int sqlite3BtreeClose(Btree *p){ 2012 BtShared *pBt = p->pBt; 2013 BtCursor *pCur; 2014 2015 /* Close all cursors opened via this handle. */ 2016 assert( sqlite3_mutex_held(p->db->mutex) ); 2017 sqlite3BtreeEnter(p); 2018 pCur = pBt->pCursor; 2019 while( pCur ){ 2020 BtCursor *pTmp = pCur; 2021 pCur = pCur->pNext; 2022 if( pTmp->pBtree==p ){ 2023 sqlite3BtreeCloseCursor(pTmp); 2024 } 2025 } 2026 2027 /* Rollback any active transaction and free the handle structure. 2028 ** The call to sqlite3BtreeRollback() drops any table-locks held by 2029 ** this handle. 2030 */ 2031 sqlite3BtreeRollback(p); 2032 sqlite3BtreeLeave(p); 2033 2034 /* If there are still other outstanding references to the shared-btree 2035 ** structure, return now. The remainder of this procedure cleans 2036 ** up the shared-btree. 2037 */ 2038 assert( p->wantToLock==0 && p->locked==0 ); 2039 if( !p->sharable || removeFromSharingList(pBt) ){ 2040 /* The pBt is no longer on the sharing list, so we can access 2041 ** it without having to hold the mutex. 2042 ** 2043 ** Clean out and delete the BtShared object. 2044 */ 2045 assert( !pBt->pCursor ); 2046 sqlite3PagerClose(pBt->pPager); 2047 if( pBt->xFreeSchema && pBt->pSchema ){ 2048 pBt->xFreeSchema(pBt->pSchema); 2049 } 2050 sqlite3DbFree(0, pBt->pSchema); 2051 freeTempSpace(pBt); 2052 sqlite3_free(pBt); 2053 } 2054 2055#ifndef SQLITE_OMIT_SHARED_CACHE 2056 assert( p->wantToLock==0 ); 2057 assert( p->locked==0 ); 2058 if( p->pPrev ) p->pPrev->pNext = p->pNext; 2059 if( p->pNext ) p->pNext->pPrev = p->pPrev; 2060#endif 2061 2062 sqlite3_free(p); 2063 return SQLITE_OK; 2064} 2065 2066/* 2067** Change the limit on the number of pages allowed in the cache. 2068** 2069** The maximum number of cache pages is set to the absolute 2070** value of mxPage. If mxPage is negative, the pager will 2071** operate asynchronously - it will not stop to do fsync()s 2072** to insure data is written to the disk surface before 2073** continuing. Transactions still work if synchronous is off, 2074** and the database cannot be corrupted if this program 2075** crashes. But if the operating system crashes or there is 2076** an abrupt power failure when synchronous is off, the database 2077** could be left in an inconsistent and unrecoverable state. 2078** Synchronous is on by default so database corruption is not 2079** normally a worry. 2080*/ 2081int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 2082 BtShared *pBt = p->pBt; 2083 assert( sqlite3_mutex_held(p->db->mutex) ); 2084 sqlite3BtreeEnter(p); 2085 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 2086 sqlite3BtreeLeave(p); 2087 return SQLITE_OK; 2088} 2089 2090/* 2091** Change the way data is synced to disk in order to increase or decrease 2092** how well the database resists damage due to OS crashes and power 2093** failures. Level 1 is the same as asynchronous (no syncs() occur and 2094** there is a high probability of damage) Level 2 is the default. There 2095** is a very low but non-zero probability of damage. Level 3 reduces the 2096** probability of damage to near zero but with a write performance reduction. 2097*/ 2098#ifndef SQLITE_OMIT_PAGER_PRAGMAS 2099int sqlite3BtreeSetSafetyLevel( 2100 Btree *p, /* The btree to set the safety level on */ 2101 int level, /* PRAGMA synchronous. 1=OFF, 2=NORMAL, 3=FULL */ 2102 int fullSync, /* PRAGMA fullfsync. */ 2103 int ckptFullSync /* PRAGMA checkpoint_fullfync */ 2104){ 2105 BtShared *pBt = p->pBt; 2106 assert( sqlite3_mutex_held(p->db->mutex) ); 2107 assert( level>=1 && level<=3 ); 2108 sqlite3BtreeEnter(p); 2109 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync, ckptFullSync); 2110 sqlite3BtreeLeave(p); 2111 return SQLITE_OK; 2112} 2113#endif 2114 2115/* 2116** Return TRUE if the given btree is set to safety level 1. In other 2117** words, return TRUE if no sync() occurs on the disk files. 2118*/ 2119int sqlite3BtreeSyncDisabled(Btree *p){ 2120 BtShared *pBt = p->pBt; 2121 int rc; 2122 assert( sqlite3_mutex_held(p->db->mutex) ); 2123 sqlite3BtreeEnter(p); 2124 assert( pBt && pBt->pPager ); 2125 rc = sqlite3PagerNosync(pBt->pPager); 2126 sqlite3BtreeLeave(p); 2127 return rc; 2128} 2129 2130/* 2131** Change the default pages size and the number of reserved bytes per page. 2132** Or, if the page size has already been fixed, return SQLITE_READONLY 2133** without changing anything. 2134** 2135** The page size must be a power of 2 between 512 and 65536. If the page 2136** size supplied does not meet this constraint then the page size is not 2137** changed. 2138** 2139** Page sizes are constrained to be a power of two so that the region 2140** of the database file used for locking (beginning at PENDING_BYTE, 2141** the first byte past the 1GB boundary, 0x40000000) needs to occur 2142** at the beginning of a page. 2143** 2144** If parameter nReserve is less than zero, then the number of reserved 2145** bytes per page is left unchanged. 2146** 2147** If the iFix!=0 then the pageSizeFixed flag is set so that the page size 2148** and autovacuum mode can no longer be changed. 2149*/ 2150int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ 2151 int rc = SQLITE_OK; 2152 BtShared *pBt = p->pBt; 2153 assert( nReserve>=-1 && nReserve<=255 ); 2154 sqlite3BtreeEnter(p); 2155 if( pBt->pageSizeFixed ){ 2156 sqlite3BtreeLeave(p); 2157 return SQLITE_READONLY; 2158 } 2159 if( nReserve<0 ){ 2160 nReserve = pBt->pageSize - pBt->usableSize; 2161 } 2162 assert( nReserve>=0 && nReserve<=255 ); 2163 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 2164 ((pageSize-1)&pageSize)==0 ){ 2165 assert( (pageSize & 7)==0 ); 2166 assert( !pBt->pPage1 && !pBt->pCursor ); 2167 pBt->pageSize = (u32)pageSize; 2168 freeTempSpace(pBt); 2169 } 2170 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 2171 pBt->usableSize = pBt->pageSize - (u16)nReserve; 2172 if( iFix ) pBt->pageSizeFixed = 1; 2173 sqlite3BtreeLeave(p); 2174 return rc; 2175} 2176 2177/* 2178** Return the currently defined page size 2179*/ 2180int sqlite3BtreeGetPageSize(Btree *p){ 2181 return p->pBt->pageSize; 2182} 2183 2184#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) 2185/* 2186** Return the number of bytes of space at the end of every page that 2187** are intentually left unused. This is the "reserved" space that is 2188** sometimes used by extensions. 2189*/ 2190int sqlite3BtreeGetReserve(Btree *p){ 2191 int n; 2192 sqlite3BtreeEnter(p); 2193 n = p->pBt->pageSize - p->pBt->usableSize; 2194 sqlite3BtreeLeave(p); 2195 return n; 2196} 2197 2198/* 2199** Set the maximum page count for a database if mxPage is positive. 2200** No changes are made if mxPage is 0 or negative. 2201** Regardless of the value of mxPage, return the maximum page count. 2202*/ 2203int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ 2204 int n; 2205 sqlite3BtreeEnter(p); 2206 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 2207 sqlite3BtreeLeave(p); 2208 return n; 2209} 2210 2211/* 2212** Set the secureDelete flag if newFlag is 0 or 1. If newFlag is -1, 2213** then make no changes. Always return the value of the secureDelete 2214** setting after the change. 2215*/ 2216int sqlite3BtreeSecureDelete(Btree *p, int newFlag){ 2217 int b; 2218 if( p==0 ) return 0; 2219 sqlite3BtreeEnter(p); 2220 if( newFlag>=0 ){ 2221 p->pBt->secureDelete = (newFlag!=0) ? 1 : 0; 2222 } 2223 b = p->pBt->secureDelete; 2224 sqlite3BtreeLeave(p); 2225 return b; 2226} 2227#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */ 2228 2229/* 2230** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 2231** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 2232** is disabled. The default value for the auto-vacuum property is 2233** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 2234*/ 2235int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 2236#ifdef SQLITE_OMIT_AUTOVACUUM 2237 return SQLITE_READONLY; 2238#else 2239 BtShared *pBt = p->pBt; 2240 int rc = SQLITE_OK; 2241 u8 av = (u8)autoVacuum; 2242 2243 sqlite3BtreeEnter(p); 2244 if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){ 2245 rc = SQLITE_READONLY; 2246 }else{ 2247 pBt->autoVacuum = av ?1:0; 2248 pBt->incrVacuum = av==2 ?1:0; 2249 } 2250 sqlite3BtreeLeave(p); 2251 return rc; 2252#endif 2253} 2254 2255/* 2256** Return the value of the 'auto-vacuum' property. If auto-vacuum is 2257** enabled 1 is returned. Otherwise 0. 2258*/ 2259int sqlite3BtreeGetAutoVacuum(Btree *p){ 2260#ifdef SQLITE_OMIT_AUTOVACUUM 2261 return BTREE_AUTOVACUUM_NONE; 2262#else 2263 int rc; 2264 sqlite3BtreeEnter(p); 2265 rc = ( 2266 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 2267 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 2268 BTREE_AUTOVACUUM_INCR 2269 ); 2270 sqlite3BtreeLeave(p); 2271 return rc; 2272#endif 2273} 2274 2275 2276/* 2277** Get a reference to pPage1 of the database file. This will 2278** also acquire a readlock on that file. 2279** 2280** SQLITE_OK is returned on success. If the file is not a 2281** well-formed database file, then SQLITE_CORRUPT is returned. 2282** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 2283** is returned if we run out of memory. 2284*/ 2285static int lockBtree(BtShared *pBt){ 2286 int rc; /* Result code from subfunctions */ 2287 MemPage *pPage1; /* Page 1 of the database file */ 2288 int nPage; /* Number of pages in the database */ 2289 int nPageFile = 0; /* Number of pages in the database file */ 2290 int nPageHeader; /* Number of pages in the database according to hdr */ 2291 2292 assert( sqlite3_mutex_held(pBt->mutex) ); 2293 assert( pBt->pPage1==0 ); 2294 rc = sqlite3PagerSharedLock(pBt->pPager); 2295 if( rc!=SQLITE_OK ) return rc; 2296 rc = btreeGetPage(pBt, 1, &pPage1, 0); 2297 if( rc!=SQLITE_OK ) return rc; 2298 2299 /* Do some checking to help insure the file we opened really is 2300 ** a valid database file. 2301 */ 2302 nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData); 2303 sqlite3PagerPagecount(pBt->pPager, &nPageFile); 2304 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){ 2305 nPage = nPageFile; 2306 } 2307 if( nPage>0 ){ 2308 u32 pageSize; 2309 u32 usableSize; 2310 u8 *page1 = pPage1->aData; 2311 rc = SQLITE_NOTADB; 2312 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 2313 goto page1_init_failed; 2314 } 2315 2316#ifdef SQLITE_OMIT_WAL 2317 if( page1[18]>1 ){ 2318 pBt->readOnly = 1; 2319 } 2320 if( page1[19]>1 ){ 2321 goto page1_init_failed; 2322 } 2323#else 2324 if( page1[18]>2 ){ 2325 pBt->readOnly = 1; 2326 } 2327 if( page1[19]>2 ){ 2328 goto page1_init_failed; 2329 } 2330 2331 /* If the write version is set to 2, this database should be accessed 2332 ** in WAL mode. If the log is not already open, open it now. Then 2333 ** return SQLITE_OK and return without populating BtShared.pPage1. 2334 ** The caller detects this and calls this function again. This is 2335 ** required as the version of page 1 currently in the page1 buffer 2336 ** may not be the latest version - there may be a newer one in the log 2337 ** file. 2338 */ 2339 if( page1[19]==2 && pBt->doNotUseWAL==0 ){ 2340 int isOpen = 0; 2341 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); 2342 if( rc!=SQLITE_OK ){ 2343 goto page1_init_failed; 2344 }else if( isOpen==0 ){ 2345 releasePage(pPage1); 2346 return SQLITE_OK; 2347 } 2348 rc = SQLITE_NOTADB; 2349 } 2350#endif 2351 2352 /* The maximum embedded fraction must be exactly 25%. And the minimum 2353 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data. 2354 ** The original design allowed these amounts to vary, but as of 2355 ** version 3.6.0, we require them to be fixed. 2356 */ 2357 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ 2358 goto page1_init_failed; 2359 } 2360 pageSize = (page1[16]<<8) | (page1[17]<<16); 2361 if( ((pageSize-1)&pageSize)!=0 2362 || pageSize>SQLITE_MAX_PAGE_SIZE 2363 || pageSize<=256 2364 ){ 2365 goto page1_init_failed; 2366 } 2367 assert( (pageSize & 7)==0 ); 2368 usableSize = pageSize - page1[20]; 2369 if( (u32)pageSize!=pBt->pageSize ){ 2370 /* After reading the first page of the database assuming a page size 2371 ** of BtShared.pageSize, we have discovered that the page-size is 2372 ** actually pageSize. Unlock the database, leave pBt->pPage1 at 2373 ** zero and return SQLITE_OK. The caller will call this function 2374 ** again with the correct page-size. 2375 */ 2376 releasePage(pPage1); 2377 pBt->usableSize = usableSize; 2378 pBt->pageSize = pageSize; 2379 freeTempSpace(pBt); 2380 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, 2381 pageSize-usableSize); 2382 return rc; 2383 } 2384 if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){ 2385 rc = SQLITE_CORRUPT_BKPT; 2386 goto page1_init_failed; 2387 } 2388 if( usableSize<480 ){ 2389 goto page1_init_failed; 2390 } 2391 pBt->pageSize = pageSize; 2392 pBt->usableSize = usableSize; 2393#ifndef SQLITE_OMIT_AUTOVACUUM 2394 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 2395 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 2396#endif 2397 } 2398 2399 /* maxLocal is the maximum amount of payload to store locally for 2400 ** a cell. Make sure it is small enough so that at least minFanout 2401 ** cells can will fit on one page. We assume a 10-byte page header. 2402 ** Besides the payload, the cell must store: 2403 ** 2-byte pointer to the cell 2404 ** 4-byte child pointer 2405 ** 9-byte nKey value 2406 ** 4-byte nData value 2407 ** 4-byte overflow page pointer 2408 ** So a cell consists of a 2-byte pointer, a header which is as much as 2409 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 2410 ** page pointer. 2411 */ 2412 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23); 2413 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23); 2414 pBt->maxLeaf = (u16)(pBt->usableSize - 35); 2415 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23); 2416 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 2417 pBt->pPage1 = pPage1; 2418 pBt->nPage = nPage; 2419 return SQLITE_OK; 2420 2421page1_init_failed: 2422 releasePage(pPage1); 2423 pBt->pPage1 = 0; 2424 return rc; 2425} 2426 2427/* 2428** If there are no outstanding cursors and we are not in the middle 2429** of a transaction but there is a read lock on the database, then 2430** this routine unrefs the first page of the database file which 2431** has the effect of releasing the read lock. 2432** 2433** If there is a transaction in progress, this routine is a no-op. 2434*/ 2435static void unlockBtreeIfUnused(BtShared *pBt){ 2436 assert( sqlite3_mutex_held(pBt->mutex) ); 2437 assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE ); 2438 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ 2439 assert( pBt->pPage1->aData ); 2440 assert( sqlite3PagerRefcount(pBt->pPager)==1 ); 2441 assert( pBt->pPage1->aData ); 2442 releasePage(pBt->pPage1); 2443 pBt->pPage1 = 0; 2444 } 2445} 2446 2447/* 2448** If pBt points to an empty file then convert that empty file 2449** into a new empty database by initializing the first page of 2450** the database. 2451*/ 2452static int newDatabase(BtShared *pBt){ 2453 MemPage *pP1; 2454 unsigned char *data; 2455 int rc; 2456 2457 assert( sqlite3_mutex_held(pBt->mutex) ); 2458 if( pBt->nPage>0 ){ 2459 return SQLITE_OK; 2460 } 2461 pP1 = pBt->pPage1; 2462 assert( pP1!=0 ); 2463 data = pP1->aData; 2464 rc = sqlite3PagerWrite(pP1->pDbPage); 2465 if( rc ) return rc; 2466 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 2467 assert( sizeof(zMagicHeader)==16 ); 2468 data[16] = (u8)((pBt->pageSize>>8)&0xff); 2469 data[17] = (u8)((pBt->pageSize>>16)&0xff); 2470 data[18] = 1; 2471 data[19] = 1; 2472 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); 2473 data[20] = (u8)(pBt->pageSize - pBt->usableSize); 2474 data[21] = 64; 2475 data[22] = 32; 2476 data[23] = 32; 2477 memset(&data[24], 0, 100-24); 2478 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 2479 pBt->pageSizeFixed = 1; 2480#ifndef SQLITE_OMIT_AUTOVACUUM 2481 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 2482 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 2483 put4byte(&data[36 + 4*4], pBt->autoVacuum); 2484 put4byte(&data[36 + 7*4], pBt->incrVacuum); 2485#endif 2486 pBt->nPage = 1; 2487 data[31] = 1; 2488 return SQLITE_OK; 2489} 2490 2491/* 2492** Attempt to start a new transaction. A write-transaction 2493** is started if the second argument is nonzero, otherwise a read- 2494** transaction. If the second argument is 2 or more and exclusive 2495** transaction is started, meaning that no other process is allowed 2496** to access the database. A preexisting transaction may not be 2497** upgraded to exclusive by calling this routine a second time - the 2498** exclusivity flag only works for a new transaction. 2499** 2500** A write-transaction must be started before attempting any 2501** changes to the database. None of the following routines 2502** will work unless a transaction is started first: 2503** 2504** sqlite3BtreeCreateTable() 2505** sqlite3BtreeCreateIndex() 2506** sqlite3BtreeClearTable() 2507** sqlite3BtreeDropTable() 2508** sqlite3BtreeInsert() 2509** sqlite3BtreeDelete() 2510** sqlite3BtreeUpdateMeta() 2511** 2512** If an initial attempt to acquire the lock fails because of lock contention 2513** and the database was previously unlocked, then invoke the busy handler 2514** if there is one. But if there was previously a read-lock, do not 2515** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is 2516** returned when there is already a read-lock in order to avoid a deadlock. 2517** 2518** Suppose there are two processes A and B. A has a read lock and B has 2519** a reserved lock. B tries to promote to exclusive but is blocked because 2520** of A's read lock. A tries to promote to reserved but is blocked by B. 2521** One or the other of the two processes must give way or there can be 2522** no progress. By returning SQLITE_BUSY and not invoking the busy callback 2523** when A already has a read lock, we encourage A to give up and let B 2524** proceed. 2525*/ 2526int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ 2527 sqlite3 *pBlock = 0; 2528 BtShared *pBt = p->pBt; 2529 int rc = SQLITE_OK; 2530 2531 sqlite3BtreeEnter(p); 2532 btreeIntegrity(p); 2533 2534 /* If the btree is already in a write-transaction, or it 2535 ** is already in a read-transaction and a read-transaction 2536 ** is requested, this is a no-op. 2537 */ 2538 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 2539 goto trans_begun; 2540 } 2541 2542 /* Write transactions are not possible on a read-only database */ 2543 if( pBt->readOnly && wrflag ){ 2544 rc = SQLITE_READONLY; 2545 goto trans_begun; 2546 } 2547 2548#ifndef SQLITE_OMIT_SHARED_CACHE 2549 /* If another database handle has already opened a write transaction 2550 ** on this shared-btree structure and a second write transaction is 2551 ** requested, return SQLITE_LOCKED. 2552 */ 2553 if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){ 2554 pBlock = pBt->pWriter->db; 2555 }else if( wrflag>1 ){ 2556 BtLock *pIter; 2557 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 2558 if( pIter->pBtree!=p ){ 2559 pBlock = pIter->pBtree->db; 2560 break; 2561 } 2562 } 2563 } 2564 if( pBlock ){ 2565 sqlite3ConnectionBlocked(p->db, pBlock); 2566 rc = SQLITE_LOCKED_SHAREDCACHE; 2567 goto trans_begun; 2568 } 2569#endif 2570 2571 /* Any read-only or read-write transaction implies a read-lock on 2572 ** page 1. So if some other shared-cache client already has a write-lock 2573 ** on page 1, the transaction cannot be opened. */ 2574 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 2575 if( SQLITE_OK!=rc ) goto trans_begun; 2576 2577 pBt->initiallyEmpty = (u8)(pBt->nPage==0); 2578 do { 2579 /* Call lockBtree() until either pBt->pPage1 is populated or 2580 ** lockBtree() returns something other than SQLITE_OK. lockBtree() 2581 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after 2582 ** reading page 1 it discovers that the page-size of the database 2583 ** file is not pBt->pageSize. In this case lockBtree() will update 2584 ** pBt->pageSize to the page-size of the file on disk. 2585 */ 2586 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); 2587 2588 if( rc==SQLITE_OK && wrflag ){ 2589 if( pBt->readOnly ){ 2590 rc = SQLITE_READONLY; 2591 }else{ 2592 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db)); 2593 if( rc==SQLITE_OK ){ 2594 rc = newDatabase(pBt); 2595 } 2596 } 2597 } 2598 2599 if( rc!=SQLITE_OK ){ 2600 unlockBtreeIfUnused(pBt); 2601 } 2602 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 2603 btreeInvokeBusyHandler(pBt) ); 2604 2605 if( rc==SQLITE_OK ){ 2606 if( p->inTrans==TRANS_NONE ){ 2607 pBt->nTransaction++; 2608#ifndef SQLITE_OMIT_SHARED_CACHE 2609 if( p->sharable ){ 2610 assert( p->lock.pBtree==p && p->lock.iTable==1 ); 2611 p->lock.eLock = READ_LOCK; 2612 p->lock.pNext = pBt->pLock; 2613 pBt->pLock = &p->lock; 2614 } 2615#endif 2616 } 2617 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 2618 if( p->inTrans>pBt->inTransaction ){ 2619 pBt->inTransaction = p->inTrans; 2620 } 2621 if( wrflag ){ 2622 MemPage *pPage1 = pBt->pPage1; 2623#ifndef SQLITE_OMIT_SHARED_CACHE 2624 assert( !pBt->pWriter ); 2625 pBt->pWriter = p; 2626 pBt->isExclusive = (u8)(wrflag>1); 2627#endif 2628 2629 /* If the db-size header field is incorrect (as it may be if an old 2630 ** client has been writing the database file), update it now. Doing 2631 ** this sooner rather than later means the database size can safely 2632 ** re-read the database size from page 1 if a savepoint or transaction 2633 ** rollback occurs within the transaction. 2634 */ 2635 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){ 2636 rc = sqlite3PagerWrite(pPage1->pDbPage); 2637 if( rc==SQLITE_OK ){ 2638 put4byte(&pPage1->aData[28], pBt->nPage); 2639 } 2640 } 2641 } 2642 } 2643 2644 2645trans_begun: 2646 if( rc==SQLITE_OK && wrflag ){ 2647 /* This call makes sure that the pager has the correct number of 2648 ** open savepoints. If the second parameter is greater than 0 and 2649 ** the sub-journal is not already open, then it will be opened here. 2650 */ 2651 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); 2652 } 2653 2654 btreeIntegrity(p); 2655 sqlite3BtreeLeave(p); 2656 return rc; 2657} 2658 2659#ifndef SQLITE_OMIT_AUTOVACUUM 2660 2661/* 2662** Set the pointer-map entries for all children of page pPage. Also, if 2663** pPage contains cells that point to overflow pages, set the pointer 2664** map entries for the overflow pages as well. 2665*/ 2666static int setChildPtrmaps(MemPage *pPage){ 2667 int i; /* Counter variable */ 2668 int nCell; /* Number of cells in page pPage */ 2669 int rc; /* Return code */ 2670 BtShared *pBt = pPage->pBt; 2671 u8 isInitOrig = pPage->isInit; 2672 Pgno pgno = pPage->pgno; 2673 2674 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2675 rc = btreeInitPage(pPage); 2676 if( rc!=SQLITE_OK ){ 2677 goto set_child_ptrmaps_out; 2678 } 2679 nCell = pPage->nCell; 2680 2681 for(i=0; i<nCell; i++){ 2682 u8 *pCell = findCell(pPage, i); 2683 2684 ptrmapPutOvflPtr(pPage, pCell, &rc); 2685 2686 if( !pPage->leaf ){ 2687 Pgno childPgno = get4byte(pCell); 2688 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 2689 } 2690 } 2691 2692 if( !pPage->leaf ){ 2693 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 2694 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 2695 } 2696 2697set_child_ptrmaps_out: 2698 pPage->isInit = isInitOrig; 2699 return rc; 2700} 2701 2702/* 2703** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so 2704** that it points to iTo. Parameter eType describes the type of pointer to 2705** be modified, as follows: 2706** 2707** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 2708** page of pPage. 2709** 2710** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 2711** page pointed to by one of the cells on pPage. 2712** 2713** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 2714** overflow page in the list. 2715*/ 2716static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 2717 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 2718 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 2719 if( eType==PTRMAP_OVERFLOW2 ){ 2720 /* The pointer is always the first 4 bytes of the page in this case. */ 2721 if( get4byte(pPage->aData)!=iFrom ){ 2722 return SQLITE_CORRUPT_BKPT; 2723 } 2724 put4byte(pPage->aData, iTo); 2725 }else{ 2726 u8 isInitOrig = pPage->isInit; 2727 int i; 2728 int nCell; 2729 2730 btreeInitPage(pPage); 2731 nCell = pPage->nCell; 2732 2733 for(i=0; i<nCell; i++){ 2734 u8 *pCell = findCell(pPage, i); 2735 if( eType==PTRMAP_OVERFLOW1 ){ 2736 CellInfo info; 2737 btreeParseCellPtr(pPage, pCell, &info); 2738 if( info.iOverflow ){ 2739 if( iFrom==get4byte(&pCell[info.iOverflow]) ){ 2740 put4byte(&pCell[info.iOverflow], iTo); 2741 break; 2742 } 2743 } 2744 }else{ 2745 if( get4byte(pCell)==iFrom ){ 2746 put4byte(pCell, iTo); 2747 break; 2748 } 2749 } 2750 } 2751 2752 if( i==nCell ){ 2753 if( eType!=PTRMAP_BTREE || 2754 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 2755 return SQLITE_CORRUPT_BKPT; 2756 } 2757 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 2758 } 2759 2760 pPage->isInit = isInitOrig; 2761 } 2762 return SQLITE_OK; 2763} 2764 2765 2766/* 2767** Move the open database page pDbPage to location iFreePage in the 2768** database. The pDbPage reference remains valid. 2769** 2770** The isCommit flag indicates that there is no need to remember that 2771** the journal needs to be sync()ed before database page pDbPage->pgno 2772** can be written to. The caller has already promised not to write to that 2773** page. 2774*/ 2775static int relocatePage( 2776 BtShared *pBt, /* Btree */ 2777 MemPage *pDbPage, /* Open page to move */ 2778 u8 eType, /* Pointer map 'type' entry for pDbPage */ 2779 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 2780 Pgno iFreePage, /* The location to move pDbPage to */ 2781 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */ 2782){ 2783 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 2784 Pgno iDbPage = pDbPage->pgno; 2785 Pager *pPager = pBt->pPager; 2786 int rc; 2787 2788 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 2789 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 2790 assert( sqlite3_mutex_held(pBt->mutex) ); 2791 assert( pDbPage->pBt==pBt ); 2792 2793 /* Move page iDbPage from its current location to page number iFreePage */ 2794 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 2795 iDbPage, iFreePage, iPtrPage, eType)); 2796 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 2797 if( rc!=SQLITE_OK ){ 2798 return rc; 2799 } 2800 pDbPage->pgno = iFreePage; 2801 2802 /* If pDbPage was a btree-page, then it may have child pages and/or cells 2803 ** that point to overflow pages. The pointer map entries for all these 2804 ** pages need to be changed. 2805 ** 2806 ** If pDbPage is an overflow page, then the first 4 bytes may store a 2807 ** pointer to a subsequent overflow page. If this is the case, then 2808 ** the pointer map needs to be updated for the subsequent overflow page. 2809 */ 2810 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 2811 rc = setChildPtrmaps(pDbPage); 2812 if( rc!=SQLITE_OK ){ 2813 return rc; 2814 } 2815 }else{ 2816 Pgno nextOvfl = get4byte(pDbPage->aData); 2817 if( nextOvfl!=0 ){ 2818 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); 2819 if( rc!=SQLITE_OK ){ 2820 return rc; 2821 } 2822 } 2823 } 2824 2825 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 2826 ** that it points at iFreePage. Also fix the pointer map entry for 2827 ** iPtrPage. 2828 */ 2829 if( eType!=PTRMAP_ROOTPAGE ){ 2830 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 2831 if( rc!=SQLITE_OK ){ 2832 return rc; 2833 } 2834 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 2835 if( rc!=SQLITE_OK ){ 2836 releasePage(pPtrPage); 2837 return rc; 2838 } 2839 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 2840 releasePage(pPtrPage); 2841 if( rc==SQLITE_OK ){ 2842 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); 2843 } 2844 } 2845 return rc; 2846} 2847 2848/* Forward declaration required by incrVacuumStep(). */ 2849static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 2850 2851/* 2852** Perform a single step of an incremental-vacuum. If successful, 2853** return SQLITE_OK. If there is no work to do (and therefore no 2854** point in calling this function again), return SQLITE_DONE. 2855** 2856** More specificly, this function attempts to re-organize the 2857** database so that the last page of the file currently in use 2858** is no longer in use. 2859** 2860** If the nFin parameter is non-zero, this function assumes 2861** that the caller will keep calling incrVacuumStep() until 2862** it returns SQLITE_DONE or an error, and that nFin is the 2863** number of pages the database file will contain after this 2864** process is complete. If nFin is zero, it is assumed that 2865** incrVacuumStep() will be called a finite amount of times 2866** which may or may not empty the freelist. A full autovacuum 2867** has nFin>0. A "PRAGMA incremental_vacuum" has nFin==0. 2868*/ 2869static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){ 2870 Pgno nFreeList; /* Number of pages still on the free-list */ 2871 int rc; 2872 2873 assert( sqlite3_mutex_held(pBt->mutex) ); 2874 assert( iLastPg>nFin ); 2875 2876 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 2877 u8 eType; 2878 Pgno iPtrPage; 2879 2880 nFreeList = get4byte(&pBt->pPage1->aData[36]); 2881 if( nFreeList==0 ){ 2882 return SQLITE_DONE; 2883 } 2884 2885 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 2886 if( rc!=SQLITE_OK ){ 2887 return rc; 2888 } 2889 if( eType==PTRMAP_ROOTPAGE ){ 2890 return SQLITE_CORRUPT_BKPT; 2891 } 2892 2893 if( eType==PTRMAP_FREEPAGE ){ 2894 if( nFin==0 ){ 2895 /* Remove the page from the files free-list. This is not required 2896 ** if nFin is non-zero. In that case, the free-list will be 2897 ** truncated to zero after this function returns, so it doesn't 2898 ** matter if it still contains some garbage entries. 2899 */ 2900 Pgno iFreePg; 2901 MemPage *pFreePg; 2902 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1); 2903 if( rc!=SQLITE_OK ){ 2904 return rc; 2905 } 2906 assert( iFreePg==iLastPg ); 2907 releasePage(pFreePg); 2908 } 2909 } else { 2910 Pgno iFreePg; /* Index of free page to move pLastPg to */ 2911 MemPage *pLastPg; 2912 2913 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); 2914 if( rc!=SQLITE_OK ){ 2915 return rc; 2916 } 2917 2918 /* If nFin is zero, this loop runs exactly once and page pLastPg 2919 ** is swapped with the first free page pulled off the free list. 2920 ** 2921 ** On the other hand, if nFin is greater than zero, then keep 2922 ** looping until a free-page located within the first nFin pages 2923 ** of the file is found. 2924 */ 2925 do { 2926 MemPage *pFreePg; 2927 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0); 2928 if( rc!=SQLITE_OK ){ 2929 releasePage(pLastPg); 2930 return rc; 2931 } 2932 releasePage(pFreePg); 2933 }while( nFin!=0 && iFreePg>nFin ); 2934 assert( iFreePg<iLastPg ); 2935 2936 rc = sqlite3PagerWrite(pLastPg->pDbPage); 2937 if( rc==SQLITE_OK ){ 2938 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0); 2939 } 2940 releasePage(pLastPg); 2941 if( rc!=SQLITE_OK ){ 2942 return rc; 2943 } 2944 } 2945 } 2946 2947 if( nFin==0 ){ 2948 iLastPg--; 2949 while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){ 2950 if( PTRMAP_ISPAGE(pBt, iLastPg) ){ 2951 MemPage *pPg; 2952 rc = btreeGetPage(pBt, iLastPg, &pPg, 0); 2953 if( rc!=SQLITE_OK ){ 2954 return rc; 2955 } 2956 rc = sqlite3PagerWrite(pPg->pDbPage); 2957 releasePage(pPg); 2958 if( rc!=SQLITE_OK ){ 2959 return rc; 2960 } 2961 } 2962 iLastPg--; 2963 } 2964 sqlite3PagerTruncateImage(pBt->pPager, iLastPg); 2965 pBt->nPage = iLastPg; 2966 } 2967 return SQLITE_OK; 2968} 2969 2970/* 2971** A write-transaction must be opened before calling this function. 2972** It performs a single unit of work towards an incremental vacuum. 2973** 2974** If the incremental vacuum is finished after this function has run, 2975** SQLITE_DONE is returned. If it is not finished, but no error occurred, 2976** SQLITE_OK is returned. Otherwise an SQLite error code. 2977*/ 2978int sqlite3BtreeIncrVacuum(Btree *p){ 2979 int rc; 2980 BtShared *pBt = p->pBt; 2981 2982 sqlite3BtreeEnter(p); 2983 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 2984 if( !pBt->autoVacuum ){ 2985 rc = SQLITE_DONE; 2986 }else{ 2987 invalidateAllOverflowCache(pBt); 2988 rc = incrVacuumStep(pBt, 0, btreePagecount(pBt)); 2989 if( rc==SQLITE_OK ){ 2990 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 2991 put4byte(&pBt->pPage1->aData[28], pBt->nPage); 2992 } 2993 } 2994 sqlite3BtreeLeave(p); 2995 return rc; 2996} 2997 2998/* 2999** This routine is called prior to sqlite3PagerCommit when a transaction 3000** is commited for an auto-vacuum database. 3001** 3002** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages 3003** the database file should be truncated to during the commit process. 3004** i.e. the database has been reorganized so that only the first *pnTrunc 3005** pages are in use. 3006*/ 3007static int autoVacuumCommit(BtShared *pBt){ 3008 int rc = SQLITE_OK; 3009 Pager *pPager = pBt->pPager; 3010 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) ); 3011 3012 assert( sqlite3_mutex_held(pBt->mutex) ); 3013 invalidateAllOverflowCache(pBt); 3014 assert(pBt->autoVacuum); 3015 if( !pBt->incrVacuum ){ 3016 Pgno nFin; /* Number of pages in database after autovacuuming */ 3017 Pgno nFree; /* Number of pages on the freelist initially */ 3018 Pgno nPtrmap; /* Number of PtrMap pages to be freed */ 3019 Pgno iFree; /* The next page to be freed */ 3020 int nEntry; /* Number of entries on one ptrmap page */ 3021 Pgno nOrig; /* Database size before freeing */ 3022 3023 nOrig = btreePagecount(pBt); 3024 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ 3025 /* It is not possible to create a database for which the final page 3026 ** is either a pointer-map page or the pending-byte page. If one 3027 ** is encountered, this indicates corruption. 3028 */ 3029 return SQLITE_CORRUPT_BKPT; 3030 } 3031 3032 nFree = get4byte(&pBt->pPage1->aData[36]); 3033 nEntry = pBt->usableSize/5; 3034 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; 3035 nFin = nOrig - nFree - nPtrmap; 3036 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ 3037 nFin--; 3038 } 3039 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 3040 nFin--; 3041 } 3042 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; 3043 3044 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ 3045 rc = incrVacuumStep(pBt, nFin, iFree); 3046 } 3047 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ 3048 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 3049 put4byte(&pBt->pPage1->aData[32], 0); 3050 put4byte(&pBt->pPage1->aData[36], 0); 3051 put4byte(&pBt->pPage1->aData[28], nFin); 3052 sqlite3PagerTruncateImage(pBt->pPager, nFin); 3053 pBt->nPage = nFin; 3054 } 3055 if( rc!=SQLITE_OK ){ 3056 sqlite3PagerRollback(pPager); 3057 } 3058 } 3059 3060 assert( nRef==sqlite3PagerRefcount(pPager) ); 3061 return rc; 3062} 3063 3064#else /* ifndef SQLITE_OMIT_AUTOVACUUM */ 3065# define setChildPtrmaps(x) SQLITE_OK 3066#endif 3067 3068/* 3069** This routine does the first phase of a two-phase commit. This routine 3070** causes a rollback journal to be created (if it does not already exist) 3071** and populated with enough information so that if a power loss occurs 3072** the database can be restored to its original state by playing back 3073** the journal. Then the contents of the journal are flushed out to 3074** the disk. After the journal is safely on oxide, the changes to the 3075** database are written into the database file and flushed to oxide. 3076** At the end of this call, the rollback journal still exists on the 3077** disk and we are still holding all locks, so the transaction has not 3078** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the 3079** commit process. 3080** 3081** This call is a no-op if no write-transaction is currently active on pBt. 3082** 3083** Otherwise, sync the database file for the btree pBt. zMaster points to 3084** the name of a master journal file that should be written into the 3085** individual journal file, or is NULL, indicating no master journal file 3086** (single database transaction). 3087** 3088** When this is called, the master journal should already have been 3089** created, populated with this journal pointer and synced to disk. 3090** 3091** Once this is routine has returned, the only thing required to commit 3092** the write-transaction for this database file is to delete the journal. 3093*/ 3094int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ 3095 int rc = SQLITE_OK; 3096 if( p->inTrans==TRANS_WRITE ){ 3097 BtShared *pBt = p->pBt; 3098 sqlite3BtreeEnter(p); 3099#ifndef SQLITE_OMIT_AUTOVACUUM 3100 if( pBt->autoVacuum ){ 3101 rc = autoVacuumCommit(pBt); 3102 if( rc!=SQLITE_OK ){ 3103 sqlite3BtreeLeave(p); 3104 return rc; 3105 } 3106 } 3107#endif 3108 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0); 3109 sqlite3BtreeLeave(p); 3110 } 3111 return rc; 3112} 3113 3114/* 3115** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() 3116** at the conclusion of a transaction. 3117*/ 3118static void btreeEndTransaction(Btree *p){ 3119 BtShared *pBt = p->pBt; 3120 assert( sqlite3BtreeHoldsMutex(p) ); 3121 3122 btreeClearHasContent(pBt); 3123 if( p->inTrans>TRANS_NONE && p->db->activeVdbeCnt>1 ){ 3124 /* If there are other active statements that belong to this database 3125 ** handle, downgrade to a read-only transaction. The other statements 3126 ** may still be reading from the database. */ 3127 downgradeAllSharedCacheTableLocks(p); 3128 p->inTrans = TRANS_READ; 3129 }else{ 3130 /* If the handle had any kind of transaction open, decrement the 3131 ** transaction count of the shared btree. If the transaction count 3132 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() 3133 ** call below will unlock the pager. */ 3134 if( p->inTrans!=TRANS_NONE ){ 3135 clearAllSharedCacheTableLocks(p); 3136 pBt->nTransaction--; 3137 if( 0==pBt->nTransaction ){ 3138 pBt->inTransaction = TRANS_NONE; 3139 } 3140 } 3141 3142 /* Set the current transaction state to TRANS_NONE and unlock the 3143 ** pager if this call closed the only read or write transaction. */ 3144 p->inTrans = TRANS_NONE; 3145 unlockBtreeIfUnused(pBt); 3146 } 3147 3148 btreeIntegrity(p); 3149} 3150 3151/* 3152** Commit the transaction currently in progress. 3153** 3154** This routine implements the second phase of a 2-phase commit. The 3155** sqlite3BtreeCommitPhaseOne() routine does the first phase and should 3156** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() 3157** routine did all the work of writing information out to disk and flushing the 3158** contents so that they are written onto the disk platter. All this 3159** routine has to do is delete or truncate or zero the header in the 3160** the rollback journal (which causes the transaction to commit) and 3161** drop locks. 3162** 3163** Normally, if an error occurs while the pager layer is attempting to 3164** finalize the underlying journal file, this function returns an error and 3165** the upper layer will attempt a rollback. However, if the second argument 3166** is non-zero then this b-tree transaction is part of a multi-file 3167** transaction. In this case, the transaction has already been committed 3168** (by deleting a master journal file) and the caller will ignore this 3169** functions return code. So, even if an error occurs in the pager layer, 3170** reset the b-tree objects internal state to indicate that the write 3171** transaction has been closed. This is quite safe, as the pager will have 3172** transitioned to the error state. 3173** 3174** This will release the write lock on the database file. If there 3175** are no active cursors, it also releases the read lock. 3176*/ 3177int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ 3178 3179 if( p->inTrans==TRANS_NONE ) return SQLITE_OK; 3180 sqlite3BtreeEnter(p); 3181 btreeIntegrity(p); 3182 3183 /* If the handle has a write-transaction open, commit the shared-btrees 3184 ** transaction and set the shared state to TRANS_READ. 3185 */ 3186 if( p->inTrans==TRANS_WRITE ){ 3187 int rc; 3188 BtShared *pBt = p->pBt; 3189 assert( pBt->inTransaction==TRANS_WRITE ); 3190 assert( pBt->nTransaction>0 ); 3191 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 3192 if( rc!=SQLITE_OK && bCleanup==0 ){ 3193 sqlite3BtreeLeave(p); 3194 return rc; 3195 } 3196 pBt->inTransaction = TRANS_READ; 3197 } 3198 3199 btreeEndTransaction(p); 3200 sqlite3BtreeLeave(p); 3201 return SQLITE_OK; 3202} 3203 3204/* 3205** Do both phases of a commit. 3206*/ 3207int sqlite3BtreeCommit(Btree *p){ 3208 int rc; 3209 sqlite3BtreeEnter(p); 3210 rc = sqlite3BtreeCommitPhaseOne(p, 0); 3211 if( rc==SQLITE_OK ){ 3212 rc = sqlite3BtreeCommitPhaseTwo(p, 0); 3213 } 3214 sqlite3BtreeLeave(p); 3215 return rc; 3216} 3217 3218#ifndef NDEBUG 3219/* 3220** Return the number of write-cursors open on this handle. This is for use 3221** in assert() expressions, so it is only compiled if NDEBUG is not 3222** defined. 3223** 3224** For the purposes of this routine, a write-cursor is any cursor that 3225** is capable of writing to the databse. That means the cursor was 3226** originally opened for writing and the cursor has not be disabled 3227** by having its state changed to CURSOR_FAULT. 3228*/ 3229static int countWriteCursors(BtShared *pBt){ 3230 BtCursor *pCur; 3231 int r = 0; 3232 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 3233 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 3234 } 3235 return r; 3236} 3237#endif 3238 3239/* 3240** This routine sets the state to CURSOR_FAULT and the error 3241** code to errCode for every cursor on BtShared that pBtree 3242** references. 3243** 3244** Every cursor is tripped, including cursors that belong 3245** to other database connections that happen to be sharing 3246** the cache with pBtree. 3247** 3248** This routine gets called when a rollback occurs. 3249** All cursors using the same cache must be tripped 3250** to prevent them from trying to use the btree after 3251** the rollback. The rollback may have deleted tables 3252** or moved root pages, so it is not sufficient to 3253** save the state of the cursor. The cursor must be 3254** invalidated. 3255*/ 3256void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ 3257 BtCursor *p; 3258 sqlite3BtreeEnter(pBtree); 3259 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 3260 int i; 3261 sqlite3BtreeClearCursor(p); 3262 p->eState = CURSOR_FAULT; 3263 p->skipNext = errCode; 3264 for(i=0; i<=p->iPage; i++){ 3265 releasePage(p->apPage[i]); 3266 p->apPage[i] = 0; 3267 } 3268 } 3269 sqlite3BtreeLeave(pBtree); 3270} 3271 3272/* 3273** Rollback the transaction in progress. All cursors will be 3274** invalided by this operation. Any attempt to use a cursor 3275** that was open at the beginning of this operation will result 3276** in an error. 3277** 3278** This will release the write lock on the database file. If there 3279** are no active cursors, it also releases the read lock. 3280*/ 3281int sqlite3BtreeRollback(Btree *p){ 3282 int rc; 3283 BtShared *pBt = p->pBt; 3284 MemPage *pPage1; 3285 3286 sqlite3BtreeEnter(p); 3287 rc = saveAllCursors(pBt, 0, 0); 3288#ifndef SQLITE_OMIT_SHARED_CACHE 3289 if( rc!=SQLITE_OK ){ 3290 /* This is a horrible situation. An IO or malloc() error occurred whilst 3291 ** trying to save cursor positions. If this is an automatic rollback (as 3292 ** the result of a constraint, malloc() failure or IO error) then 3293 ** the cache may be internally inconsistent (not contain valid trees) so 3294 ** we cannot simply return the error to the caller. Instead, abort 3295 ** all queries that may be using any of the cursors that failed to save. 3296 */ 3297 sqlite3BtreeTripAllCursors(p, rc); 3298 } 3299#endif 3300 btreeIntegrity(p); 3301 3302 if( p->inTrans==TRANS_WRITE ){ 3303 int rc2; 3304 3305 assert( TRANS_WRITE==pBt->inTransaction ); 3306 rc2 = sqlite3PagerRollback(pBt->pPager); 3307 if( rc2!=SQLITE_OK ){ 3308 rc = rc2; 3309 } 3310 3311 /* The rollback may have destroyed the pPage1->aData value. So 3312 ** call btreeGetPage() on page 1 again to make 3313 ** sure pPage1->aData is set correctly. */ 3314 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 3315 int nPage = get4byte(28+(u8*)pPage1->aData); 3316 testcase( nPage==0 ); 3317 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage); 3318 testcase( pBt->nPage!=nPage ); 3319 pBt->nPage = nPage; 3320 releasePage(pPage1); 3321 } 3322 assert( countWriteCursors(pBt)==0 ); 3323 pBt->inTransaction = TRANS_READ; 3324 } 3325 3326 btreeEndTransaction(p); 3327 sqlite3BtreeLeave(p); 3328 return rc; 3329} 3330 3331/* 3332** Start a statement subtransaction. The subtransaction can can be rolled 3333** back independently of the main transaction. You must start a transaction 3334** before starting a subtransaction. The subtransaction is ended automatically 3335** if the main transaction commits or rolls back. 3336** 3337** Statement subtransactions are used around individual SQL statements 3338** that are contained within a BEGIN...COMMIT block. If a constraint 3339** error occurs within the statement, the effect of that one statement 3340** can be rolled back without having to rollback the entire transaction. 3341** 3342** A statement sub-transaction is implemented as an anonymous savepoint. The 3343** value passed as the second parameter is the total number of savepoints, 3344** including the new anonymous savepoint, open on the B-Tree. i.e. if there 3345** are no active savepoints and no other statement-transactions open, 3346** iStatement is 1. This anonymous savepoint can be released or rolled back 3347** using the sqlite3BtreeSavepoint() function. 3348*/ 3349int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ 3350 int rc; 3351 BtShared *pBt = p->pBt; 3352 sqlite3BtreeEnter(p); 3353 assert( p->inTrans==TRANS_WRITE ); 3354 assert( pBt->readOnly==0 ); 3355 assert( iStatement>0 ); 3356 assert( iStatement>p->db->nSavepoint ); 3357 assert( pBt->inTransaction==TRANS_WRITE ); 3358 /* At the pager level, a statement transaction is a savepoint with 3359 ** an index greater than all savepoints created explicitly using 3360 ** SQL statements. It is illegal to open, release or rollback any 3361 ** such savepoints while the statement transaction savepoint is active. 3362 */ 3363 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); 3364 sqlite3BtreeLeave(p); 3365 return rc; 3366} 3367 3368/* 3369** The second argument to this function, op, is always SAVEPOINT_ROLLBACK 3370** or SAVEPOINT_RELEASE. This function either releases or rolls back the 3371** savepoint identified by parameter iSavepoint, depending on the value 3372** of op. 3373** 3374** Normally, iSavepoint is greater than or equal to zero. However, if op is 3375** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 3376** contents of the entire transaction are rolled back. This is different 3377** from a normal transaction rollback, as no locks are released and the 3378** transaction remains open. 3379*/ 3380int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ 3381 int rc = SQLITE_OK; 3382 if( p && p->inTrans==TRANS_WRITE ){ 3383 BtShared *pBt = p->pBt; 3384 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); 3385 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); 3386 sqlite3BtreeEnter(p); 3387 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); 3388 if( rc==SQLITE_OK ){ 3389 if( iSavepoint<0 && pBt->initiallyEmpty ) pBt->nPage = 0; 3390 rc = newDatabase(pBt); 3391 pBt->nPage = get4byte(28 + pBt->pPage1->aData); 3392 3393 /* The database size was written into the offset 28 of the header 3394 ** when the transaction started, so we know that the value at offset 3395 ** 28 is nonzero. */ 3396 assert( pBt->nPage>0 ); 3397 } 3398 sqlite3BtreeLeave(p); 3399 } 3400 return rc; 3401} 3402 3403/* 3404** Create a new cursor for the BTree whose root is on the page 3405** iTable. If a read-only cursor is requested, it is assumed that 3406** the caller already has at least a read-only transaction open 3407** on the database already. If a write-cursor is requested, then 3408** the caller is assumed to have an open write transaction. 3409** 3410** If wrFlag==0, then the cursor can only be used for reading. 3411** If wrFlag==1, then the cursor can be used for reading or for 3412** writing if other conditions for writing are also met. These 3413** are the conditions that must be met in order for writing to 3414** be allowed: 3415** 3416** 1: The cursor must have been opened with wrFlag==1 3417** 3418** 2: Other database connections that share the same pager cache 3419** but which are not in the READ_UNCOMMITTED state may not have 3420** cursors open with wrFlag==0 on the same table. Otherwise 3421** the changes made by this write cursor would be visible to 3422** the read cursors in the other database connection. 3423** 3424** 3: The database must be writable (not on read-only media) 3425** 3426** 4: There must be an active transaction. 3427** 3428** No checking is done to make sure that page iTable really is the 3429** root page of a b-tree. If it is not, then the cursor acquired 3430** will not work correctly. 3431** 3432** It is assumed that the sqlite3BtreeCursorZero() has been called 3433** on pCur to initialize the memory space prior to invoking this routine. 3434*/ 3435static int btreeCursor( 3436 Btree *p, /* The btree */ 3437 int iTable, /* Root page of table to open */ 3438 int wrFlag, /* 1 to write. 0 read-only */ 3439 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 3440 BtCursor *pCur /* Space for new cursor */ 3441){ 3442 BtShared *pBt = p->pBt; /* Shared b-tree handle */ 3443 3444 assert( sqlite3BtreeHoldsMutex(p) ); 3445 assert( wrFlag==0 || wrFlag==1 ); 3446 3447 /* The following assert statements verify that if this is a sharable 3448 ** b-tree database, the connection is holding the required table locks, 3449 ** and that no other connection has any open cursor that conflicts with 3450 ** this lock. */ 3451 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) ); 3452 assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); 3453 3454 /* Assert that the caller has opened the required transaction. */ 3455 assert( p->inTrans>TRANS_NONE ); 3456 assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); 3457 assert( pBt->pPage1 && pBt->pPage1->aData ); 3458 3459 if( NEVER(wrFlag && pBt->readOnly) ){ 3460 return SQLITE_READONLY; 3461 } 3462 if( iTable==1 && btreePagecount(pBt)==0 ){ 3463 return SQLITE_EMPTY; 3464 } 3465 3466 /* Now that no other errors can occur, finish filling in the BtCursor 3467 ** variables and link the cursor into the BtShared list. */ 3468 pCur->pgnoRoot = (Pgno)iTable; 3469 pCur->iPage = -1; 3470 pCur->pKeyInfo = pKeyInfo; 3471 pCur->pBtree = p; 3472 pCur->pBt = pBt; 3473 pCur->wrFlag = (u8)wrFlag; 3474 pCur->pNext = pBt->pCursor; 3475 if( pCur->pNext ){ 3476 pCur->pNext->pPrev = pCur; 3477 } 3478 pBt->pCursor = pCur; 3479 pCur->eState = CURSOR_INVALID; 3480 pCur->cachedRowid = 0; 3481 return SQLITE_OK; 3482} 3483int sqlite3BtreeCursor( 3484 Btree *p, /* The btree */ 3485 int iTable, /* Root page of table to open */ 3486 int wrFlag, /* 1 to write. 0 read-only */ 3487 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 3488 BtCursor *pCur /* Write new cursor here */ 3489){ 3490 int rc; 3491 sqlite3BtreeEnter(p); 3492 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 3493 sqlite3BtreeLeave(p); 3494 return rc; 3495} 3496 3497/* 3498** Return the size of a BtCursor object in bytes. 3499** 3500** This interfaces is needed so that users of cursors can preallocate 3501** sufficient storage to hold a cursor. The BtCursor object is opaque 3502** to users so they cannot do the sizeof() themselves - they must call 3503** this routine. 3504*/ 3505int sqlite3BtreeCursorSize(void){ 3506 return ROUND8(sizeof(BtCursor)); 3507} 3508 3509/* 3510** Initialize memory that will be converted into a BtCursor object. 3511** 3512** The simple approach here would be to memset() the entire object 3513** to zero. But it turns out that the apPage[] and aiIdx[] arrays 3514** do not need to be zeroed and they are large, so we can save a lot 3515** of run-time by skipping the initialization of those elements. 3516*/ 3517void sqlite3BtreeCursorZero(BtCursor *p){ 3518 memset(p, 0, offsetof(BtCursor, iPage)); 3519} 3520 3521/* 3522** Set the cached rowid value of every cursor in the same database file 3523** as pCur and having the same root page number as pCur. The value is 3524** set to iRowid. 3525** 3526** Only positive rowid values are considered valid for this cache. 3527** The cache is initialized to zero, indicating an invalid cache. 3528** A btree will work fine with zero or negative rowids. We just cannot 3529** cache zero or negative rowids, which means tables that use zero or 3530** negative rowids might run a little slower. But in practice, zero 3531** or negative rowids are very uncommon so this should not be a problem. 3532*/ 3533void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){ 3534 BtCursor *p; 3535 for(p=pCur->pBt->pCursor; p; p=p->pNext){ 3536 if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid; 3537 } 3538 assert( pCur->cachedRowid==iRowid ); 3539} 3540 3541/* 3542** Return the cached rowid for the given cursor. A negative or zero 3543** return value indicates that the rowid cache is invalid and should be 3544** ignored. If the rowid cache has never before been set, then a 3545** zero is returned. 3546*/ 3547sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){ 3548 return pCur->cachedRowid; 3549} 3550 3551/* 3552** Close a cursor. The read lock on the database file is released 3553** when the last cursor is closed. 3554*/ 3555int sqlite3BtreeCloseCursor(BtCursor *pCur){ 3556 Btree *pBtree = pCur->pBtree; 3557 if( pBtree ){ 3558 int i; 3559 BtShared *pBt = pCur->pBt; 3560 sqlite3BtreeEnter(pBtree); 3561 sqlite3BtreeClearCursor(pCur); 3562 if( pCur->pPrev ){ 3563 pCur->pPrev->pNext = pCur->pNext; 3564 }else{ 3565 pBt->pCursor = pCur->pNext; 3566 } 3567 if( pCur->pNext ){ 3568 pCur->pNext->pPrev = pCur->pPrev; 3569 } 3570 for(i=0; i<=pCur->iPage; i++){ 3571 releasePage(pCur->apPage[i]); 3572 } 3573 unlockBtreeIfUnused(pBt); 3574 invalidateOverflowCache(pCur); 3575 /* sqlite3_free(pCur); */ 3576 sqlite3BtreeLeave(pBtree); 3577 } 3578 return SQLITE_OK; 3579} 3580 3581/* 3582** Make sure the BtCursor* given in the argument has a valid 3583** BtCursor.info structure. If it is not already valid, call 3584** btreeParseCell() to fill it in. 3585** 3586** BtCursor.info is a cache of the information in the current cell. 3587** Using this cache reduces the number of calls to btreeParseCell(). 3588** 3589** 2007-06-25: There is a bug in some versions of MSVC that cause the 3590** compiler to crash when getCellInfo() is implemented as a macro. 3591** But there is a measureable speed advantage to using the macro on gcc 3592** (when less compiler optimizations like -Os or -O0 are used and the 3593** compiler is not doing agressive inlining.) So we use a real function 3594** for MSVC and a macro for everything else. Ticket #2457. 3595*/ 3596#ifndef NDEBUG 3597 static void assertCellInfo(BtCursor *pCur){ 3598 CellInfo info; 3599 int iPage = pCur->iPage; 3600 memset(&info, 0, sizeof(info)); 3601 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info); 3602 assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); 3603 } 3604#else 3605 #define assertCellInfo(x) 3606#endif 3607#ifdef _MSC_VER 3608 /* Use a real function in MSVC to work around bugs in that compiler. */ 3609 static void getCellInfo(BtCursor *pCur){ 3610 if( pCur->info.nSize==0 ){ 3611 int iPage = pCur->iPage; 3612 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); 3613 pCur->validNKey = 1; 3614 }else{ 3615 assertCellInfo(pCur); 3616 } 3617 } 3618#else /* if not _MSC_VER */ 3619 /* Use a macro in all other compilers so that the function is inlined */ 3620#define getCellInfo(pCur) \ 3621 if( pCur->info.nSize==0 ){ \ 3622 int iPage = pCur->iPage; \ 3623 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \ 3624 pCur->validNKey = 1; \ 3625 }else{ \ 3626 assertCellInfo(pCur); \ 3627 } 3628#endif /* _MSC_VER */ 3629 3630#ifndef NDEBUG /* The next routine used only within assert() statements */ 3631/* 3632** Return true if the given BtCursor is valid. A valid cursor is one 3633** that is currently pointing to a row in a (non-empty) table. 3634** This is a verification routine is used only within assert() statements. 3635*/ 3636int sqlite3BtreeCursorIsValid(BtCursor *pCur){ 3637 return pCur && pCur->eState==CURSOR_VALID; 3638} 3639#endif /* NDEBUG */ 3640 3641/* 3642** Set *pSize to the size of the buffer needed to hold the value of 3643** the key for the current entry. If the cursor is not pointing 3644** to a valid entry, *pSize is set to 0. 3645** 3646** For a table with the INTKEY flag set, this routine returns the key 3647** itself, not the number of bytes in the key. 3648** 3649** The caller must position the cursor prior to invoking this routine. 3650** 3651** This routine cannot fail. It always returns SQLITE_OK. 3652*/ 3653int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ 3654 assert( cursorHoldsMutex(pCur) ); 3655 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); 3656 if( pCur->eState!=CURSOR_VALID ){ 3657 *pSize = 0; 3658 }else{ 3659 getCellInfo(pCur); 3660 *pSize = pCur->info.nKey; 3661 } 3662 return SQLITE_OK; 3663} 3664 3665/* 3666** Set *pSize to the number of bytes of data in the entry the 3667** cursor currently points to. 3668** 3669** The caller must guarantee that the cursor is pointing to a non-NULL 3670** valid entry. In other words, the calling procedure must guarantee 3671** that the cursor has Cursor.eState==CURSOR_VALID. 3672** 3673** Failure is not possible. This function always returns SQLITE_OK. 3674** It might just as well be a procedure (returning void) but we continue 3675** to return an integer result code for historical reasons. 3676*/ 3677int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ 3678 assert( cursorHoldsMutex(pCur) ); 3679 assert( pCur->eState==CURSOR_VALID ); 3680 getCellInfo(pCur); 3681 *pSize = pCur->info.nData; 3682 return SQLITE_OK; 3683} 3684 3685/* 3686** Given the page number of an overflow page in the database (parameter 3687** ovfl), this function finds the page number of the next page in the 3688** linked list of overflow pages. If possible, it uses the auto-vacuum 3689** pointer-map data instead of reading the content of page ovfl to do so. 3690** 3691** If an error occurs an SQLite error code is returned. Otherwise: 3692** 3693** The page number of the next overflow page in the linked list is 3694** written to *pPgnoNext. If page ovfl is the last page in its linked 3695** list, *pPgnoNext is set to zero. 3696** 3697** If ppPage is not NULL, and a reference to the MemPage object corresponding 3698** to page number pOvfl was obtained, then *ppPage is set to point to that 3699** reference. It is the responsibility of the caller to call releasePage() 3700** on *ppPage to free the reference. In no reference was obtained (because 3701** the pointer-map was used to obtain the value for *pPgnoNext), then 3702** *ppPage is set to zero. 3703*/ 3704static int getOverflowPage( 3705 BtShared *pBt, /* The database file */ 3706 Pgno ovfl, /* Current overflow page number */ 3707 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ 3708 Pgno *pPgnoNext /* OUT: Next overflow page number */ 3709){ 3710 Pgno next = 0; 3711 MemPage *pPage = 0; 3712 int rc = SQLITE_OK; 3713 3714 assert( sqlite3_mutex_held(pBt->mutex) ); 3715 assert(pPgnoNext); 3716 3717#ifndef SQLITE_OMIT_AUTOVACUUM 3718 /* Try to find the next page in the overflow list using the 3719 ** autovacuum pointer-map pages. Guess that the next page in 3720 ** the overflow list is page number (ovfl+1). If that guess turns 3721 ** out to be wrong, fall back to loading the data of page 3722 ** number ovfl to determine the next page number. 3723 */ 3724 if( pBt->autoVacuum ){ 3725 Pgno pgno; 3726 Pgno iGuess = ovfl+1; 3727 u8 eType; 3728 3729 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 3730 iGuess++; 3731 } 3732 3733 if( iGuess<=btreePagecount(pBt) ){ 3734 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 3735 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 3736 next = iGuess; 3737 rc = SQLITE_DONE; 3738 } 3739 } 3740 } 3741#endif 3742 3743 assert( next==0 || rc==SQLITE_DONE ); 3744 if( rc==SQLITE_OK ){ 3745 rc = btreeGetPage(pBt, ovfl, &pPage, 0); 3746 assert( rc==SQLITE_OK || pPage==0 ); 3747 if( rc==SQLITE_OK ){ 3748 next = get4byte(pPage->aData); 3749 } 3750 } 3751 3752 *pPgnoNext = next; 3753 if( ppPage ){ 3754 *ppPage = pPage; 3755 }else{ 3756 releasePage(pPage); 3757 } 3758 return (rc==SQLITE_DONE ? SQLITE_OK : rc); 3759} 3760 3761/* 3762** Copy data from a buffer to a page, or from a page to a buffer. 3763** 3764** pPayload is a pointer to data stored on database page pDbPage. 3765** If argument eOp is false, then nByte bytes of data are copied 3766** from pPayload to the buffer pointed at by pBuf. If eOp is true, 3767** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 3768** of data are copied from the buffer pBuf to pPayload. 3769** 3770** SQLITE_OK is returned on success, otherwise an error code. 3771*/ 3772static int copyPayload( 3773 void *pPayload, /* Pointer to page data */ 3774 void *pBuf, /* Pointer to buffer */ 3775 int nByte, /* Number of bytes to copy */ 3776 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 3777 DbPage *pDbPage /* Page containing pPayload */ 3778){ 3779 if( eOp ){ 3780 /* Copy data from buffer to page (a write operation) */ 3781 int rc = sqlite3PagerWrite(pDbPage); 3782 if( rc!=SQLITE_OK ){ 3783 return rc; 3784 } 3785 memcpy(pPayload, pBuf, nByte); 3786 }else{ 3787 /* Copy data from page to buffer (a read operation) */ 3788 memcpy(pBuf, pPayload, nByte); 3789 } 3790 return SQLITE_OK; 3791} 3792 3793/* 3794** This function is used to read or overwrite payload information 3795** for the entry that the pCur cursor is pointing to. If the eOp 3796** parameter is 0, this is a read operation (data copied into 3797** buffer pBuf). If it is non-zero, a write (data copied from 3798** buffer pBuf). 3799** 3800** A total of "amt" bytes are read or written beginning at "offset". 3801** Data is read to or from the buffer pBuf. 3802** 3803** The content being read or written might appear on the main page 3804** or be scattered out on multiple overflow pages. 3805** 3806** If the BtCursor.isIncrblobHandle flag is set, and the current 3807** cursor entry uses one or more overflow pages, this function 3808** allocates space for and lazily popluates the overflow page-list 3809** cache array (BtCursor.aOverflow). Subsequent calls use this 3810** cache to make seeking to the supplied offset more efficient. 3811** 3812** Once an overflow page-list cache has been allocated, it may be 3813** invalidated if some other cursor writes to the same table, or if 3814** the cursor is moved to a different row. Additionally, in auto-vacuum 3815** mode, the following events may invalidate an overflow page-list cache. 3816** 3817** * An incremental vacuum, 3818** * A commit in auto_vacuum="full" mode, 3819** * Creating a table (may require moving an overflow page). 3820*/ 3821static int accessPayload( 3822 BtCursor *pCur, /* Cursor pointing to entry to read from */ 3823 u32 offset, /* Begin reading this far into payload */ 3824 u32 amt, /* Read this many bytes */ 3825 unsigned char *pBuf, /* Write the bytes into this buffer */ 3826 int eOp /* zero to read. non-zero to write. */ 3827){ 3828 unsigned char *aPayload; 3829 int rc = SQLITE_OK; 3830 u32 nKey; 3831 int iIdx = 0; 3832 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */ 3833 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ 3834 3835 assert( pPage ); 3836 assert( pCur->eState==CURSOR_VALID ); 3837 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 3838 assert( cursorHoldsMutex(pCur) ); 3839 3840 getCellInfo(pCur); 3841 aPayload = pCur->info.pCell + pCur->info.nHeader; 3842 nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey); 3843 3844 if( NEVER(offset+amt > nKey+pCur->info.nData) 3845 || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] 3846 ){ 3847 /* Trying to read or write past the end of the data is an error */ 3848 return SQLITE_CORRUPT_BKPT; 3849 } 3850 3851 /* Check if data must be read/written to/from the btree page itself. */ 3852 if( offset<pCur->info.nLocal ){ 3853 int a = amt; 3854 if( a+offset>pCur->info.nLocal ){ 3855 a = pCur->info.nLocal - offset; 3856 } 3857 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 3858 offset = 0; 3859 pBuf += a; 3860 amt -= a; 3861 }else{ 3862 offset -= pCur->info.nLocal; 3863 } 3864 3865 if( rc==SQLITE_OK && amt>0 ){ 3866 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 3867 Pgno nextPage; 3868 3869 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 3870 3871#ifndef SQLITE_OMIT_INCRBLOB 3872 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[] 3873 ** has not been allocated, allocate it now. The array is sized at 3874 ** one entry for each overflow page in the overflow chain. The 3875 ** page number of the first overflow page is stored in aOverflow[0], 3876 ** etc. A value of 0 in the aOverflow[] array means "not yet known" 3877 ** (the cache is lazily populated). 3878 */ 3879 if( pCur->isIncrblobHandle && !pCur->aOverflow ){ 3880 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 3881 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); 3882 /* nOvfl is always positive. If it were zero, fetchPayload would have 3883 ** been used instead of this routine. */ 3884 if( ALWAYS(nOvfl) && !pCur->aOverflow ){ 3885 rc = SQLITE_NOMEM; 3886 } 3887 } 3888 3889 /* If the overflow page-list cache has been allocated and the 3890 ** entry for the first required overflow page is valid, skip 3891 ** directly to it. 3892 */ 3893 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ 3894 iIdx = (offset/ovflSize); 3895 nextPage = pCur->aOverflow[iIdx]; 3896 offset = (offset%ovflSize); 3897 } 3898#endif 3899 3900 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ 3901 3902#ifndef SQLITE_OMIT_INCRBLOB 3903 /* If required, populate the overflow page-list cache. */ 3904 if( pCur->aOverflow ){ 3905 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); 3906 pCur->aOverflow[iIdx] = nextPage; 3907 } 3908#endif 3909 3910 if( offset>=ovflSize ){ 3911 /* The only reason to read this page is to obtain the page 3912 ** number for the next page in the overflow chain. The page 3913 ** data is not required. So first try to lookup the overflow 3914 ** page-list cache, if any, then fall back to the getOverflowPage() 3915 ** function. 3916 */ 3917#ifndef SQLITE_OMIT_INCRBLOB 3918 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ 3919 nextPage = pCur->aOverflow[iIdx+1]; 3920 } else 3921#endif 3922 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 3923 offset -= ovflSize; 3924 }else{ 3925 /* Need to read this page properly. It contains some of the 3926 ** range of data that is being read (eOp==0) or written (eOp!=0). 3927 */ 3928 DbPage *pDbPage; 3929 int a = amt; 3930 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); 3931 if( rc==SQLITE_OK ){ 3932 aPayload = sqlite3PagerGetData(pDbPage); 3933 nextPage = get4byte(aPayload); 3934 if( a + offset > ovflSize ){ 3935 a = ovflSize - offset; 3936 } 3937 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 3938 sqlite3PagerUnref(pDbPage); 3939 offset = 0; 3940 amt -= a; 3941 pBuf += a; 3942 } 3943 } 3944 } 3945 } 3946 3947 if( rc==SQLITE_OK && amt>0 ){ 3948 return SQLITE_CORRUPT_BKPT; 3949 } 3950 return rc; 3951} 3952 3953/* 3954** Read part of the key associated with cursor pCur. Exactly 3955** "amt" bytes will be transfered into pBuf[]. The transfer 3956** begins at "offset". 3957** 3958** The caller must ensure that pCur is pointing to a valid row 3959** in the table. 3960** 3961** Return SQLITE_OK on success or an error code if anything goes 3962** wrong. An error is returned if "offset+amt" is larger than 3963** the available payload. 3964*/ 3965int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3966 assert( cursorHoldsMutex(pCur) ); 3967 assert( pCur->eState==CURSOR_VALID ); 3968 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 3969 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 3970 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); 3971} 3972 3973/* 3974** Read part of the data associated with cursor pCur. Exactly 3975** "amt" bytes will be transfered into pBuf[]. The transfer 3976** begins at "offset". 3977** 3978** Return SQLITE_OK on success or an error code if anything goes 3979** wrong. An error is returned if "offset+amt" is larger than 3980** the available payload. 3981*/ 3982int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 3983 int rc; 3984 3985#ifndef SQLITE_OMIT_INCRBLOB 3986 if ( pCur->eState==CURSOR_INVALID ){ 3987 return SQLITE_ABORT; 3988 } 3989#endif 3990 3991 assert( cursorHoldsMutex(pCur) ); 3992 rc = restoreCursorPosition(pCur); 3993 if( rc==SQLITE_OK ){ 3994 assert( pCur->eState==CURSOR_VALID ); 3995 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 3996 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 3997 rc = accessPayload(pCur, offset, amt, pBuf, 0); 3998 } 3999 return rc; 4000} 4001 4002/* 4003** Return a pointer to payload information from the entry that the 4004** pCur cursor is pointing to. The pointer is to the beginning of 4005** the key if skipKey==0 and it points to the beginning of data if 4006** skipKey==1. The number of bytes of available key/data is written 4007** into *pAmt. If *pAmt==0, then the value returned will not be 4008** a valid pointer. 4009** 4010** This routine is an optimization. It is common for the entire key 4011** and data to fit on the local page and for there to be no overflow 4012** pages. When that is so, this routine can be used to access the 4013** key and data without making a copy. If the key and/or data spills 4014** onto overflow pages, then accessPayload() must be used to reassemble 4015** the key/data and copy it into a preallocated buffer. 4016** 4017** The pointer returned by this routine looks directly into the cached 4018** page of the database. The data might change or move the next time 4019** any btree routine is called. 4020*/ 4021static const unsigned char *fetchPayload( 4022 BtCursor *pCur, /* Cursor pointing to entry to read from */ 4023 int *pAmt, /* Write the number of available bytes here */ 4024 int skipKey /* read beginning at data if this is true */ 4025){ 4026 unsigned char *aPayload; 4027 MemPage *pPage; 4028 u32 nKey; 4029 u32 nLocal; 4030 4031 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]); 4032 assert( pCur->eState==CURSOR_VALID ); 4033 assert( cursorHoldsMutex(pCur) ); 4034 pPage = pCur->apPage[pCur->iPage]; 4035 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 4036 if( NEVER(pCur->info.nSize==0) ){ 4037 btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage], 4038 &pCur->info); 4039 } 4040 aPayload = pCur->info.pCell; 4041 aPayload += pCur->info.nHeader; 4042 if( pPage->intKey ){ 4043 nKey = 0; 4044 }else{ 4045 nKey = (int)pCur->info.nKey; 4046 } 4047 if( skipKey ){ 4048 aPayload += nKey; 4049 nLocal = pCur->info.nLocal - nKey; 4050 }else{ 4051 nLocal = pCur->info.nLocal; 4052 assert( nLocal<=nKey ); 4053 } 4054 *pAmt = nLocal; 4055 return aPayload; 4056} 4057 4058 4059/* 4060** For the entry that cursor pCur is point to, return as 4061** many bytes of the key or data as are available on the local 4062** b-tree page. Write the number of available bytes into *pAmt. 4063** 4064** The pointer returned is ephemeral. The key/data may move 4065** or be destroyed on the next call to any Btree routine, 4066** including calls from other threads against the same cache. 4067** Hence, a mutex on the BtShared should be held prior to calling 4068** this routine. 4069** 4070** These routines is used to get quick access to key and data 4071** in the common case where no overflow pages are used. 4072*/ 4073const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ 4074 const void *p = 0; 4075 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4076 assert( cursorHoldsMutex(pCur) ); 4077 if( ALWAYS(pCur->eState==CURSOR_VALID) ){ 4078 p = (const void*)fetchPayload(pCur, pAmt, 0); 4079 } 4080 return p; 4081} 4082const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ 4083 const void *p = 0; 4084 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4085 assert( cursorHoldsMutex(pCur) ); 4086 if( ALWAYS(pCur->eState==CURSOR_VALID) ){ 4087 p = (const void*)fetchPayload(pCur, pAmt, 1); 4088 } 4089 return p; 4090} 4091 4092 4093/* 4094** Move the cursor down to a new child page. The newPgno argument is the 4095** page number of the child page to move to. 4096** 4097** This function returns SQLITE_CORRUPT if the page-header flags field of 4098** the new child page does not match the flags field of the parent (i.e. 4099** if an intkey page appears to be the parent of a non-intkey page, or 4100** vice-versa). 4101*/ 4102static int moveToChild(BtCursor *pCur, u32 newPgno){ 4103 int rc; 4104 int i = pCur->iPage; 4105 MemPage *pNewPage; 4106 BtShared *pBt = pCur->pBt; 4107 4108 assert( cursorHoldsMutex(pCur) ); 4109 assert( pCur->eState==CURSOR_VALID ); 4110 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 4111 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 4112 return SQLITE_CORRUPT_BKPT; 4113 } 4114 rc = getAndInitPage(pBt, newPgno, &pNewPage); 4115 if( rc ) return rc; 4116 pCur->apPage[i+1] = pNewPage; 4117 pCur->aiIdx[i+1] = 0; 4118 pCur->iPage++; 4119 4120 pCur->info.nSize = 0; 4121 pCur->validNKey = 0; 4122 if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){ 4123 return SQLITE_CORRUPT_BKPT; 4124 } 4125 return SQLITE_OK; 4126} 4127 4128#ifndef NDEBUG 4129/* 4130** Page pParent is an internal (non-leaf) tree page. This function 4131** asserts that page number iChild is the left-child if the iIdx'th 4132** cell in page pParent. Or, if iIdx is equal to the total number of 4133** cells in pParent, that page number iChild is the right-child of 4134** the page. 4135*/ 4136static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 4137 assert( iIdx<=pParent->nCell ); 4138 if( iIdx==pParent->nCell ){ 4139 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 4140 }else{ 4141 assert( get4byte(findCell(pParent, iIdx))==iChild ); 4142 } 4143} 4144#else 4145# define assertParentIndex(x,y,z) 4146#endif 4147 4148/* 4149** Move the cursor up to the parent page. 4150** 4151** pCur->idx is set to the cell index that contains the pointer 4152** to the page we are coming from. If we are coming from the 4153** right-most child page then pCur->idx is set to one more than 4154** the largest cell index. 4155*/ 4156static void moveToParent(BtCursor *pCur){ 4157 assert( cursorHoldsMutex(pCur) ); 4158 assert( pCur->eState==CURSOR_VALID ); 4159 assert( pCur->iPage>0 ); 4160 assert( pCur->apPage[pCur->iPage] ); 4161 assertParentIndex( 4162 pCur->apPage[pCur->iPage-1], 4163 pCur->aiIdx[pCur->iPage-1], 4164 pCur->apPage[pCur->iPage]->pgno 4165 ); 4166 releasePage(pCur->apPage[pCur->iPage]); 4167 pCur->iPage--; 4168 pCur->info.nSize = 0; 4169 pCur->validNKey = 0; 4170} 4171 4172/* 4173** Move the cursor to point to the root page of its b-tree structure. 4174** 4175** If the table has a virtual root page, then the cursor is moved to point 4176** to the virtual root page instead of the actual root page. A table has a 4177** virtual root page when the actual root page contains no cells and a 4178** single child page. This can only happen with the table rooted at page 1. 4179** 4180** If the b-tree structure is empty, the cursor state is set to 4181** CURSOR_INVALID. Otherwise, the cursor is set to point to the first 4182** cell located on the root (or virtual root) page and the cursor state 4183** is set to CURSOR_VALID. 4184** 4185** If this function returns successfully, it may be assumed that the 4186** page-header flags indicate that the [virtual] root-page is the expected 4187** kind of b-tree page (i.e. if when opening the cursor the caller did not 4188** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, 4189** indicating a table b-tree, or if the caller did specify a KeyInfo 4190** structure the flags byte is set to 0x02 or 0x0A, indicating an index 4191** b-tree). 4192*/ 4193static int moveToRoot(BtCursor *pCur){ 4194 MemPage *pRoot; 4195 int rc = SQLITE_OK; 4196 Btree *p = pCur->pBtree; 4197 BtShared *pBt = p->pBt; 4198 4199 assert( cursorHoldsMutex(pCur) ); 4200 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 4201 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 4202 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 4203 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 4204 if( pCur->eState==CURSOR_FAULT ){ 4205 assert( pCur->skipNext!=SQLITE_OK ); 4206 return pCur->skipNext; 4207 } 4208 sqlite3BtreeClearCursor(pCur); 4209 } 4210 4211 if( pCur->iPage>=0 ){ 4212 int i; 4213 for(i=1; i<=pCur->iPage; i++){ 4214 releasePage(pCur->apPage[i]); 4215 } 4216 pCur->iPage = 0; 4217 }else{ 4218 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]); 4219 if( rc!=SQLITE_OK ){ 4220 pCur->eState = CURSOR_INVALID; 4221 return rc; 4222 } 4223 pCur->iPage = 0; 4224 4225 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor 4226 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is 4227 ** NULL, the caller expects a table b-tree. If this is not the case, 4228 ** return an SQLITE_CORRUPT error. */ 4229 assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 ); 4230 if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){ 4231 return SQLITE_CORRUPT_BKPT; 4232 } 4233 } 4234 4235 /* Assert that the root page is of the correct type. This must be the 4236 ** case as the call to this function that loaded the root-page (either 4237 ** this call or a previous invocation) would have detected corruption 4238 ** if the assumption were not true, and it is not possible for the flags 4239 ** byte to have been modified while this cursor is holding a reference 4240 ** to the page. */ 4241 pRoot = pCur->apPage[0]; 4242 assert( pRoot->pgno==pCur->pgnoRoot ); 4243 assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey ); 4244 4245 pCur->aiIdx[0] = 0; 4246 pCur->info.nSize = 0; 4247 pCur->atLast = 0; 4248 pCur->validNKey = 0; 4249 4250 if( pRoot->nCell==0 && !pRoot->leaf ){ 4251 Pgno subpage; 4252 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; 4253 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 4254 pCur->eState = CURSOR_VALID; 4255 rc = moveToChild(pCur, subpage); 4256 }else{ 4257 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID); 4258 } 4259 return rc; 4260} 4261 4262/* 4263** Move the cursor down to the left-most leaf entry beneath the 4264** entry to which it is currently pointing. 4265** 4266** The left-most leaf is the one with the smallest key - the first 4267** in ascending order. 4268*/ 4269static int moveToLeftmost(BtCursor *pCur){ 4270 Pgno pgno; 4271 int rc = SQLITE_OK; 4272 MemPage *pPage; 4273 4274 assert( cursorHoldsMutex(pCur) ); 4275 assert( pCur->eState==CURSOR_VALID ); 4276 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 4277 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 4278 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage])); 4279 rc = moveToChild(pCur, pgno); 4280 } 4281 return rc; 4282} 4283 4284/* 4285** Move the cursor down to the right-most leaf entry beneath the 4286** page to which it is currently pointing. Notice the difference 4287** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 4288** finds the left-most entry beneath the *entry* whereas moveToRightmost() 4289** finds the right-most entry beneath the *page*. 4290** 4291** The right-most entry is the one with the largest key - the last 4292** key in ascending order. 4293*/ 4294static int moveToRightmost(BtCursor *pCur){ 4295 Pgno pgno; 4296 int rc = SQLITE_OK; 4297 MemPage *pPage = 0; 4298 4299 assert( cursorHoldsMutex(pCur) ); 4300 assert( pCur->eState==CURSOR_VALID ); 4301 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 4302 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4303 pCur->aiIdx[pCur->iPage] = pPage->nCell; 4304 rc = moveToChild(pCur, pgno); 4305 } 4306 if( rc==SQLITE_OK ){ 4307 pCur->aiIdx[pCur->iPage] = pPage->nCell-1; 4308 pCur->info.nSize = 0; 4309 pCur->validNKey = 0; 4310 } 4311 return rc; 4312} 4313 4314/* Move the cursor to the first entry in the table. Return SQLITE_OK 4315** on success. Set *pRes to 0 if the cursor actually points to something 4316** or set *pRes to 1 if the table is empty. 4317*/ 4318int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 4319 int rc; 4320 4321 assert( cursorHoldsMutex(pCur) ); 4322 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4323 rc = moveToRoot(pCur); 4324 if( rc==SQLITE_OK ){ 4325 if( pCur->eState==CURSOR_INVALID ){ 4326 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4327 *pRes = 1; 4328 }else{ 4329 assert( pCur->apPage[pCur->iPage]->nCell>0 ); 4330 *pRes = 0; 4331 rc = moveToLeftmost(pCur); 4332 } 4333 } 4334 return rc; 4335} 4336 4337/* Move the cursor to the last entry in the table. Return SQLITE_OK 4338** on success. Set *pRes to 0 if the cursor actually points to something 4339** or set *pRes to 1 if the table is empty. 4340*/ 4341int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 4342 int rc; 4343 4344 assert( cursorHoldsMutex(pCur) ); 4345 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4346 4347 /* If the cursor already points to the last entry, this is a no-op. */ 4348 if( CURSOR_VALID==pCur->eState && pCur->atLast ){ 4349#ifdef SQLITE_DEBUG 4350 /* This block serves to assert() that the cursor really does point 4351 ** to the last entry in the b-tree. */ 4352 int ii; 4353 for(ii=0; ii<pCur->iPage; ii++){ 4354 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); 4355 } 4356 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 ); 4357 assert( pCur->apPage[pCur->iPage]->leaf ); 4358#endif 4359 return SQLITE_OK; 4360 } 4361 4362 rc = moveToRoot(pCur); 4363 if( rc==SQLITE_OK ){ 4364 if( CURSOR_INVALID==pCur->eState ){ 4365 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4366 *pRes = 1; 4367 }else{ 4368 assert( pCur->eState==CURSOR_VALID ); 4369 *pRes = 0; 4370 rc = moveToRightmost(pCur); 4371 pCur->atLast = rc==SQLITE_OK ?1:0; 4372 } 4373 } 4374 return rc; 4375} 4376 4377/* Move the cursor so that it points to an entry near the key 4378** specified by pIdxKey or intKey. Return a success code. 4379** 4380** For INTKEY tables, the intKey parameter is used. pIdxKey 4381** must be NULL. For index tables, pIdxKey is used and intKey 4382** is ignored. 4383** 4384** If an exact match is not found, then the cursor is always 4385** left pointing at a leaf page which would hold the entry if it 4386** were present. The cursor might point to an entry that comes 4387** before or after the key. 4388** 4389** An integer is written into *pRes which is the result of 4390** comparing the key with the entry to which the cursor is 4391** pointing. The meaning of the integer written into 4392** *pRes is as follows: 4393** 4394** *pRes<0 The cursor is left pointing at an entry that 4395** is smaller than intKey/pIdxKey or if the table is empty 4396** and the cursor is therefore left point to nothing. 4397** 4398** *pRes==0 The cursor is left pointing at an entry that 4399** exactly matches intKey/pIdxKey. 4400** 4401** *pRes>0 The cursor is left pointing at an entry that 4402** is larger than intKey/pIdxKey. 4403** 4404*/ 4405int sqlite3BtreeMovetoUnpacked( 4406 BtCursor *pCur, /* The cursor to be moved */ 4407 UnpackedRecord *pIdxKey, /* Unpacked index key */ 4408 i64 intKey, /* The table key */ 4409 int biasRight, /* If true, bias the search to the high end */ 4410 int *pRes /* Write search results here */ 4411){ 4412 int rc; 4413 4414 assert( cursorHoldsMutex(pCur) ); 4415 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4416 assert( pRes ); 4417 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) ); 4418 4419 /* If the cursor is already positioned at the point we are trying 4420 ** to move to, then just return without doing any work */ 4421 if( pCur->eState==CURSOR_VALID && pCur->validNKey 4422 && pCur->apPage[0]->intKey 4423 ){ 4424 if( pCur->info.nKey==intKey ){ 4425 *pRes = 0; 4426 return SQLITE_OK; 4427 } 4428 if( pCur->atLast && pCur->info.nKey<intKey ){ 4429 *pRes = -1; 4430 return SQLITE_OK; 4431 } 4432 } 4433 4434 rc = moveToRoot(pCur); 4435 if( rc ){ 4436 return rc; 4437 } 4438 assert( pCur->apPage[pCur->iPage] ); 4439 assert( pCur->apPage[pCur->iPage]->isInit ); 4440 assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID ); 4441 if( pCur->eState==CURSOR_INVALID ){ 4442 *pRes = -1; 4443 assert( pCur->apPage[pCur->iPage]->nCell==0 ); 4444 return SQLITE_OK; 4445 } 4446 assert( pCur->apPage[0]->intKey || pIdxKey ); 4447 for(;;){ 4448 int lwr, upr; 4449 Pgno chldPg; 4450 MemPage *pPage = pCur->apPage[pCur->iPage]; 4451 int c; 4452 4453 /* pPage->nCell must be greater than zero. If this is the root-page 4454 ** the cursor would have been INVALID above and this for(;;) loop 4455 ** not run. If this is not the root-page, then the moveToChild() routine 4456 ** would have already detected db corruption. Similarly, pPage must 4457 ** be the right kind (index or table) of b-tree page. Otherwise 4458 ** a moveToChild() or moveToRoot() call would have detected corruption. */ 4459 assert( pPage->nCell>0 ); 4460 assert( pPage->intKey==(pIdxKey==0) ); 4461 lwr = 0; 4462 upr = pPage->nCell-1; 4463 if( biasRight ){ 4464 pCur->aiIdx[pCur->iPage] = (u16)upr; 4465 }else{ 4466 pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2); 4467 } 4468 for(;;){ 4469 int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */ 4470 u8 *pCell; /* Pointer to current cell in pPage */ 4471 4472 pCur->info.nSize = 0; 4473 pCell = findCell(pPage, idx) + pPage->childPtrSize; 4474 if( pPage->intKey ){ 4475 i64 nCellKey; 4476 if( pPage->hasData ){ 4477 u32 dummy; 4478 pCell += getVarint32(pCell, dummy); 4479 } 4480 getVarint(pCell, (u64*)&nCellKey); 4481 if( nCellKey==intKey ){ 4482 c = 0; 4483 }else if( nCellKey<intKey ){ 4484 c = -1; 4485 }else{ 4486 assert( nCellKey>intKey ); 4487 c = +1; 4488 } 4489 pCur->validNKey = 1; 4490 pCur->info.nKey = nCellKey; 4491 }else{ 4492 /* The maximum supported page-size is 65536 bytes. This means that 4493 ** the maximum number of record bytes stored on an index B-Tree 4494 ** page is less than 16384 bytes and may be stored as a 2-byte 4495 ** varint. This information is used to attempt to avoid parsing 4496 ** the entire cell by checking for the cases where the record is 4497 ** stored entirely within the b-tree page by inspecting the first 4498 ** 2 bytes of the cell. 4499 */ 4500 int nCell = pCell[0]; 4501 if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){ 4502 /* This branch runs if the record-size field of the cell is a 4503 ** single byte varint and the record fits entirely on the main 4504 ** b-tree page. */ 4505 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 4506 }else if( !(pCell[1] & 0x80) 4507 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 4508 ){ 4509 /* The record-size field is a 2 byte varint and the record 4510 ** fits entirely on the main b-tree page. */ 4511 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 4512 }else{ 4513 /* The record flows over onto one or more overflow pages. In 4514 ** this case the whole cell needs to be parsed, a buffer allocated 4515 ** and accessPayload() used to retrieve the record into the 4516 ** buffer before VdbeRecordCompare() can be called. */ 4517 void *pCellKey; 4518 u8 * const pCellBody = pCell - pPage->childPtrSize; 4519 btreeParseCellPtr(pPage, pCellBody, &pCur->info); 4520 nCell = (int)pCur->info.nKey; 4521 pCellKey = sqlite3Malloc( nCell ); 4522 if( pCellKey==0 ){ 4523 rc = SQLITE_NOMEM; 4524 goto moveto_finish; 4525 } 4526 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); 4527 if( rc ){ 4528 sqlite3_free(pCellKey); 4529 goto moveto_finish; 4530 } 4531 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); 4532 sqlite3_free(pCellKey); 4533 } 4534 } 4535 if( c==0 ){ 4536 if( pPage->intKey && !pPage->leaf ){ 4537 lwr = idx; 4538 upr = lwr - 1; 4539 break; 4540 }else{ 4541 *pRes = 0; 4542 rc = SQLITE_OK; 4543 goto moveto_finish; 4544 } 4545 } 4546 if( c<0 ){ 4547 lwr = idx+1; 4548 }else{ 4549 upr = idx-1; 4550 } 4551 if( lwr>upr ){ 4552 break; 4553 } 4554 pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2); 4555 } 4556 assert( lwr==upr+1 ); 4557 assert( pPage->isInit ); 4558 if( pPage->leaf ){ 4559 chldPg = 0; 4560 }else if( lwr>=pPage->nCell ){ 4561 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4562 }else{ 4563 chldPg = get4byte(findCell(pPage, lwr)); 4564 } 4565 if( chldPg==0 ){ 4566 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 4567 *pRes = c; 4568 rc = SQLITE_OK; 4569 goto moveto_finish; 4570 } 4571 pCur->aiIdx[pCur->iPage] = (u16)lwr; 4572 pCur->info.nSize = 0; 4573 pCur->validNKey = 0; 4574 rc = moveToChild(pCur, chldPg); 4575 if( rc ) goto moveto_finish; 4576 } 4577moveto_finish: 4578 return rc; 4579} 4580 4581 4582/* 4583** Return TRUE if the cursor is not pointing at an entry of the table. 4584** 4585** TRUE will be returned after a call to sqlite3BtreeNext() moves 4586** past the last entry in the table or sqlite3BtreePrev() moves past 4587** the first entry. TRUE is also returned if the table is empty. 4588*/ 4589int sqlite3BtreeEof(BtCursor *pCur){ 4590 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 4591 ** have been deleted? This API will need to change to return an error code 4592 ** as well as the boolean result value. 4593 */ 4594 return (CURSOR_VALID!=pCur->eState); 4595} 4596 4597/* 4598** Advance the cursor to the next entry in the database. If 4599** successful then set *pRes=0. If the cursor 4600** was already pointing to the last entry in the database before 4601** this routine was called, then set *pRes=1. 4602*/ 4603int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ 4604 int rc; 4605 int idx; 4606 MemPage *pPage; 4607 4608 assert( cursorHoldsMutex(pCur) ); 4609 rc = restoreCursorPosition(pCur); 4610 if( rc!=SQLITE_OK ){ 4611 return rc; 4612 } 4613 assert( pRes!=0 ); 4614 if( CURSOR_INVALID==pCur->eState ){ 4615 *pRes = 1; 4616 return SQLITE_OK; 4617 } 4618 if( pCur->skipNext>0 ){ 4619 pCur->skipNext = 0; 4620 *pRes = 0; 4621 return SQLITE_OK; 4622 } 4623 pCur->skipNext = 0; 4624 4625 pPage = pCur->apPage[pCur->iPage]; 4626 idx = ++pCur->aiIdx[pCur->iPage]; 4627 assert( pPage->isInit ); 4628 assert( idx<=pPage->nCell ); 4629 4630 pCur->info.nSize = 0; 4631 pCur->validNKey = 0; 4632 if( idx>=pPage->nCell ){ 4633 if( !pPage->leaf ){ 4634 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 4635 if( rc ) return rc; 4636 rc = moveToLeftmost(pCur); 4637 *pRes = 0; 4638 return rc; 4639 } 4640 do{ 4641 if( pCur->iPage==0 ){ 4642 *pRes = 1; 4643 pCur->eState = CURSOR_INVALID; 4644 return SQLITE_OK; 4645 } 4646 moveToParent(pCur); 4647 pPage = pCur->apPage[pCur->iPage]; 4648 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell ); 4649 *pRes = 0; 4650 if( pPage->intKey ){ 4651 rc = sqlite3BtreeNext(pCur, pRes); 4652 }else{ 4653 rc = SQLITE_OK; 4654 } 4655 return rc; 4656 } 4657 *pRes = 0; 4658 if( pPage->leaf ){ 4659 return SQLITE_OK; 4660 } 4661 rc = moveToLeftmost(pCur); 4662 return rc; 4663} 4664 4665 4666/* 4667** Step the cursor to the back to the previous entry in the database. If 4668** successful then set *pRes=0. If the cursor 4669** was already pointing to the first entry in the database before 4670** this routine was called, then set *pRes=1. 4671*/ 4672int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ 4673 int rc; 4674 MemPage *pPage; 4675 4676 assert( cursorHoldsMutex(pCur) ); 4677 rc = restoreCursorPosition(pCur); 4678 if( rc!=SQLITE_OK ){ 4679 return rc; 4680 } 4681 pCur->atLast = 0; 4682 if( CURSOR_INVALID==pCur->eState ){ 4683 *pRes = 1; 4684 return SQLITE_OK; 4685 } 4686 if( pCur->skipNext<0 ){ 4687 pCur->skipNext = 0; 4688 *pRes = 0; 4689 return SQLITE_OK; 4690 } 4691 pCur->skipNext = 0; 4692 4693 pPage = pCur->apPage[pCur->iPage]; 4694 assert( pPage->isInit ); 4695 if( !pPage->leaf ){ 4696 int idx = pCur->aiIdx[pCur->iPage]; 4697 rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); 4698 if( rc ){ 4699 return rc; 4700 } 4701 rc = moveToRightmost(pCur); 4702 }else{ 4703 while( pCur->aiIdx[pCur->iPage]==0 ){ 4704 if( pCur->iPage==0 ){ 4705 pCur->eState = CURSOR_INVALID; 4706 *pRes = 1; 4707 return SQLITE_OK; 4708 } 4709 moveToParent(pCur); 4710 } 4711 pCur->info.nSize = 0; 4712 pCur->validNKey = 0; 4713 4714 pCur->aiIdx[pCur->iPage]--; 4715 pPage = pCur->apPage[pCur->iPage]; 4716 if( pPage->intKey && !pPage->leaf ){ 4717 rc = sqlite3BtreePrevious(pCur, pRes); 4718 }else{ 4719 rc = SQLITE_OK; 4720 } 4721 } 4722 *pRes = 0; 4723 return rc; 4724} 4725 4726/* 4727** Allocate a new page from the database file. 4728** 4729** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 4730** has already been called on the new page.) The new page has also 4731** been referenced and the calling routine is responsible for calling 4732** sqlite3PagerUnref() on the new page when it is done. 4733** 4734** SQLITE_OK is returned on success. Any other return value indicates 4735** an error. *ppPage and *pPgno are undefined in the event of an error. 4736** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. 4737** 4738** If the "nearby" parameter is not 0, then a (feeble) effort is made to 4739** locate a page close to the page number "nearby". This can be used in an 4740** attempt to keep related pages close to each other in the database file, 4741** which in turn can make database access faster. 4742** 4743** If the "exact" parameter is not 0, and the page-number nearby exists 4744** anywhere on the free-list, then it is guarenteed to be returned. This 4745** is only used by auto-vacuum databases when allocating a new table. 4746*/ 4747static int allocateBtreePage( 4748 BtShared *pBt, 4749 MemPage **ppPage, 4750 Pgno *pPgno, 4751 Pgno nearby, 4752 u8 exact 4753){ 4754 MemPage *pPage1; 4755 int rc; 4756 u32 n; /* Number of pages on the freelist */ 4757 u32 k; /* Number of leaves on the trunk of the freelist */ 4758 MemPage *pTrunk = 0; 4759 MemPage *pPrevTrunk = 0; 4760 Pgno mxPage; /* Total size of the database file */ 4761 4762 assert( sqlite3_mutex_held(pBt->mutex) ); 4763 pPage1 = pBt->pPage1; 4764 mxPage = btreePagecount(pBt); 4765 n = get4byte(&pPage1->aData[36]); 4766 testcase( n==mxPage-1 ); 4767 if( n>=mxPage ){ 4768 return SQLITE_CORRUPT_BKPT; 4769 } 4770 if( n>0 ){ 4771 /* There are pages on the freelist. Reuse one of those pages. */ 4772 Pgno iTrunk; 4773 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 4774 4775 /* If the 'exact' parameter was true and a query of the pointer-map 4776 ** shows that the page 'nearby' is somewhere on the free-list, then 4777 ** the entire-list will be searched for that page. 4778 */ 4779#ifndef SQLITE_OMIT_AUTOVACUUM 4780 if( exact && nearby<=mxPage ){ 4781 u8 eType; 4782 assert( nearby>0 ); 4783 assert( pBt->autoVacuum ); 4784 rc = ptrmapGet(pBt, nearby, &eType, 0); 4785 if( rc ) return rc; 4786 if( eType==PTRMAP_FREEPAGE ){ 4787 searchList = 1; 4788 } 4789 *pPgno = nearby; 4790 } 4791#endif 4792 4793 /* Decrement the free-list count by 1. Set iTrunk to the index of the 4794 ** first free-list trunk page. iPrevTrunk is initially 1. 4795 */ 4796 rc = sqlite3PagerWrite(pPage1->pDbPage); 4797 if( rc ) return rc; 4798 put4byte(&pPage1->aData[36], n-1); 4799 4800 /* The code within this loop is run only once if the 'searchList' variable 4801 ** is not true. Otherwise, it runs once for each trunk-page on the 4802 ** free-list until the page 'nearby' is located. 4803 */ 4804 do { 4805 pPrevTrunk = pTrunk; 4806 if( pPrevTrunk ){ 4807 iTrunk = get4byte(&pPrevTrunk->aData[0]); 4808 }else{ 4809 iTrunk = get4byte(&pPage1->aData[32]); 4810 } 4811 testcase( iTrunk==mxPage ); 4812 if( iTrunk>mxPage ){ 4813 rc = SQLITE_CORRUPT_BKPT; 4814 }else{ 4815 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 4816 } 4817 if( rc ){ 4818 pTrunk = 0; 4819 goto end_allocate_page; 4820 } 4821 4822 k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */ 4823 if( k==0 && !searchList ){ 4824 /* The trunk has no leaves and the list is not being searched. 4825 ** So extract the trunk page itself and use it as the newly 4826 ** allocated page */ 4827 assert( pPrevTrunk==0 ); 4828 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4829 if( rc ){ 4830 goto end_allocate_page; 4831 } 4832 *pPgno = iTrunk; 4833 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4834 *ppPage = pTrunk; 4835 pTrunk = 0; 4836 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4837 }else if( k>(u32)(pBt->usableSize/4 - 2) ){ 4838 /* Value of k is out of range. Database corruption */ 4839 rc = SQLITE_CORRUPT_BKPT; 4840 goto end_allocate_page; 4841#ifndef SQLITE_OMIT_AUTOVACUUM 4842 }else if( searchList && nearby==iTrunk ){ 4843 /* The list is being searched and this trunk page is the page 4844 ** to allocate, regardless of whether it has leaves. 4845 */ 4846 assert( *pPgno==iTrunk ); 4847 *ppPage = pTrunk; 4848 searchList = 0; 4849 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4850 if( rc ){ 4851 goto end_allocate_page; 4852 } 4853 if( k==0 ){ 4854 if( !pPrevTrunk ){ 4855 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 4856 }else{ 4857 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 4858 if( rc!=SQLITE_OK ){ 4859 goto end_allocate_page; 4860 } 4861 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 4862 } 4863 }else{ 4864 /* The trunk page is required by the caller but it contains 4865 ** pointers to free-list leaves. The first leaf becomes a trunk 4866 ** page in this case. 4867 */ 4868 MemPage *pNewTrunk; 4869 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 4870 if( iNewTrunk>mxPage ){ 4871 rc = SQLITE_CORRUPT_BKPT; 4872 goto end_allocate_page; 4873 } 4874 testcase( iNewTrunk==mxPage ); 4875 rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); 4876 if( rc!=SQLITE_OK ){ 4877 goto end_allocate_page; 4878 } 4879 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 4880 if( rc!=SQLITE_OK ){ 4881 releasePage(pNewTrunk); 4882 goto end_allocate_page; 4883 } 4884 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 4885 put4byte(&pNewTrunk->aData[4], k-1); 4886 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 4887 releasePage(pNewTrunk); 4888 if( !pPrevTrunk ){ 4889 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 4890 put4byte(&pPage1->aData[32], iNewTrunk); 4891 }else{ 4892 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 4893 if( rc ){ 4894 goto end_allocate_page; 4895 } 4896 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 4897 } 4898 } 4899 pTrunk = 0; 4900 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 4901#endif 4902 }else if( k>0 ){ 4903 /* Extract a leaf from the trunk */ 4904 u32 closest; 4905 Pgno iPage; 4906 unsigned char *aData = pTrunk->aData; 4907 if( nearby>0 ){ 4908 u32 i; 4909 int dist; 4910 closest = 0; 4911 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby); 4912 for(i=1; i<k; i++){ 4913 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby); 4914 if( d2<dist ){ 4915 closest = i; 4916 dist = d2; 4917 } 4918 } 4919 }else{ 4920 closest = 0; 4921 } 4922 4923 iPage = get4byte(&aData[8+closest*4]); 4924 testcase( iPage==mxPage ); 4925 if( iPage>mxPage ){ 4926 rc = SQLITE_CORRUPT_BKPT; 4927 goto end_allocate_page; 4928 } 4929 testcase( iPage==mxPage ); 4930 if( !searchList || iPage==nearby ){ 4931 int noContent; 4932 *pPgno = iPage; 4933 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" 4934 ": %d more free pages\n", 4935 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 4936 rc = sqlite3PagerWrite(pTrunk->pDbPage); 4937 if( rc ) goto end_allocate_page; 4938 if( closest<k-1 ){ 4939 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 4940 } 4941 put4byte(&aData[4], k-1); 4942 noContent = !btreeGetHasContent(pBt, *pPgno); 4943 rc = btreeGetPage(pBt, *pPgno, ppPage, noContent); 4944 if( rc==SQLITE_OK ){ 4945 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4946 if( rc!=SQLITE_OK ){ 4947 releasePage(*ppPage); 4948 } 4949 } 4950 searchList = 0; 4951 } 4952 } 4953 releasePage(pPrevTrunk); 4954 pPrevTrunk = 0; 4955 }while( searchList ); 4956 }else{ 4957 /* There are no pages on the freelist, so create a new page at the 4958 ** end of the file */ 4959 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 4960 if( rc ) return rc; 4961 pBt->nPage++; 4962 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; 4963 4964#ifndef SQLITE_OMIT_AUTOVACUUM 4965 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){ 4966 /* If *pPgno refers to a pointer-map page, allocate two new pages 4967 ** at the end of the file instead of one. The first allocated page 4968 ** becomes a new pointer-map page, the second is used by the caller. 4969 */ 4970 MemPage *pPg = 0; 4971 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage)); 4972 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); 4973 rc = btreeGetPage(pBt, pBt->nPage, &pPg, 1); 4974 if( rc==SQLITE_OK ){ 4975 rc = sqlite3PagerWrite(pPg->pDbPage); 4976 releasePage(pPg); 4977 } 4978 if( rc ) return rc; 4979 pBt->nPage++; 4980 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; } 4981 } 4982#endif 4983 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); 4984 *pPgno = pBt->nPage; 4985 4986 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4987 rc = btreeGetPage(pBt, *pPgno, ppPage, 1); 4988 if( rc ) return rc; 4989 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 4990 if( rc!=SQLITE_OK ){ 4991 releasePage(*ppPage); 4992 } 4993 TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); 4994 } 4995 4996 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 4997 4998end_allocate_page: 4999 releasePage(pTrunk); 5000 releasePage(pPrevTrunk); 5001 if( rc==SQLITE_OK ){ 5002 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 5003 releasePage(*ppPage); 5004 return SQLITE_CORRUPT_BKPT; 5005 } 5006 (*ppPage)->isInit = 0; 5007 }else{ 5008 *ppPage = 0; 5009 } 5010 assert( rc!=SQLITE_OK || sqlite3PagerIswriteable((*ppPage)->pDbPage) ); 5011 return rc; 5012} 5013 5014/* 5015** This function is used to add page iPage to the database file free-list. 5016** It is assumed that the page is not already a part of the free-list. 5017** 5018** The value passed as the second argument to this function is optional. 5019** If the caller happens to have a pointer to the MemPage object 5020** corresponding to page iPage handy, it may pass it as the second value. 5021** Otherwise, it may pass NULL. 5022** 5023** If a pointer to a MemPage object is passed as the second argument, 5024** its reference count is not altered by this function. 5025*/ 5026static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 5027 MemPage *pTrunk = 0; /* Free-list trunk page */ 5028 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 5029 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 5030 MemPage *pPage; /* Page being freed. May be NULL. */ 5031 int rc; /* Return Code */ 5032 int nFree; /* Initial number of pages on free-list */ 5033 5034 assert( sqlite3_mutex_held(pBt->mutex) ); 5035 assert( iPage>1 ); 5036 assert( !pMemPage || pMemPage->pgno==iPage ); 5037 5038 if( pMemPage ){ 5039 pPage = pMemPage; 5040 sqlite3PagerRef(pPage->pDbPage); 5041 }else{ 5042 pPage = btreePageLookup(pBt, iPage); 5043 } 5044 5045 /* Increment the free page count on pPage1 */ 5046 rc = sqlite3PagerWrite(pPage1->pDbPage); 5047 if( rc ) goto freepage_out; 5048 nFree = get4byte(&pPage1->aData[36]); 5049 put4byte(&pPage1->aData[36], nFree+1); 5050 5051 if( pBt->secureDelete ){ 5052 /* If the secure_delete option is enabled, then 5053 ** always fully overwrite deleted information with zeros. 5054 */ 5055 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) ) 5056 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0) 5057 ){ 5058 goto freepage_out; 5059 } 5060 memset(pPage->aData, 0, pPage->pBt->pageSize); 5061 } 5062 5063 /* If the database supports auto-vacuum, write an entry in the pointer-map 5064 ** to indicate that the page is free. 5065 */ 5066 if( ISAUTOVACUUM ){ 5067 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); 5068 if( rc ) goto freepage_out; 5069 } 5070 5071 /* Now manipulate the actual database free-list structure. There are two 5072 ** possibilities. If the free-list is currently empty, or if the first 5073 ** trunk page in the free-list is full, then this page will become a 5074 ** new free-list trunk page. Otherwise, it will become a leaf of the 5075 ** first trunk page in the current free-list. This block tests if it 5076 ** is possible to add the page as a new free-list leaf. 5077 */ 5078 if( nFree!=0 ){ 5079 u32 nLeaf; /* Initial number of leaf cells on trunk page */ 5080 5081 iTrunk = get4byte(&pPage1->aData[32]); 5082 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 5083 if( rc!=SQLITE_OK ){ 5084 goto freepage_out; 5085 } 5086 5087 nLeaf = get4byte(&pTrunk->aData[4]); 5088 assert( pBt->usableSize>32 ); 5089 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ 5090 rc = SQLITE_CORRUPT_BKPT; 5091 goto freepage_out; 5092 } 5093 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ 5094 /* In this case there is room on the trunk page to insert the page 5095 ** being freed as a new leaf. 5096 ** 5097 ** Note that the trunk page is not really full until it contains 5098 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 5099 ** coded. But due to a coding error in versions of SQLite prior to 5100 ** 3.6.0, databases with freelist trunk pages holding more than 5101 ** usableSize/4 - 8 entries will be reported as corrupt. In order 5102 ** to maintain backwards compatibility with older versions of SQLite, 5103 ** we will continue to restrict the number of entries to usableSize/4 - 8 5104 ** for now. At some point in the future (once everyone has upgraded 5105 ** to 3.6.0 or later) we should consider fixing the conditional above 5106 ** to read "usableSize/4-2" instead of "usableSize/4-8". 5107 */ 5108 rc = sqlite3PagerWrite(pTrunk->pDbPage); 5109 if( rc==SQLITE_OK ){ 5110 put4byte(&pTrunk->aData[4], nLeaf+1); 5111 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 5112 if( pPage && !pBt->secureDelete ){ 5113 sqlite3PagerDontWrite(pPage->pDbPage); 5114 } 5115 rc = btreeSetHasContent(pBt, iPage); 5116 } 5117 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); 5118 goto freepage_out; 5119 } 5120 } 5121 5122 /* If control flows to this point, then it was not possible to add the 5123 ** the page being freed as a leaf page of the first trunk in the free-list. 5124 ** Possibly because the free-list is empty, or possibly because the 5125 ** first trunk in the free-list is full. Either way, the page being freed 5126 ** will become the new first trunk page in the free-list. 5127 */ 5128 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ 5129 goto freepage_out; 5130 } 5131 rc = sqlite3PagerWrite(pPage->pDbPage); 5132 if( rc!=SQLITE_OK ){ 5133 goto freepage_out; 5134 } 5135 put4byte(pPage->aData, iTrunk); 5136 put4byte(&pPage->aData[4], 0); 5137 put4byte(&pPage1->aData[32], iPage); 5138 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); 5139 5140freepage_out: 5141 if( pPage ){ 5142 pPage->isInit = 0; 5143 } 5144 releasePage(pPage); 5145 releasePage(pTrunk); 5146 return rc; 5147} 5148static void freePage(MemPage *pPage, int *pRC){ 5149 if( (*pRC)==SQLITE_OK ){ 5150 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); 5151 } 5152} 5153 5154/* 5155** Free any overflow pages associated with the given Cell. 5156*/ 5157static int clearCell(MemPage *pPage, unsigned char *pCell){ 5158 BtShared *pBt = pPage->pBt; 5159 CellInfo info; 5160 Pgno ovflPgno; 5161 int rc; 5162 int nOvfl; 5163 u32 ovflPageSize; 5164 5165 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5166 btreeParseCellPtr(pPage, pCell, &info); 5167 if( info.iOverflow==0 ){ 5168 return SQLITE_OK; /* No overflow pages. Return without doing anything */ 5169 } 5170 ovflPgno = get4byte(&pCell[info.iOverflow]); 5171 assert( pBt->usableSize > 4 ); 5172 ovflPageSize = pBt->usableSize - 4; 5173 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; 5174 assert( ovflPgno==0 || nOvfl>0 ); 5175 while( nOvfl-- ){ 5176 Pgno iNext = 0; 5177 MemPage *pOvfl = 0; 5178 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){ 5179 /* 0 is not a legal page number and page 1 cannot be an 5180 ** overflow page. Therefore if ovflPgno<2 or past the end of the 5181 ** file the database must be corrupt. */ 5182 return SQLITE_CORRUPT_BKPT; 5183 } 5184 if( nOvfl ){ 5185 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); 5186 if( rc ) return rc; 5187 } 5188 5189 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) ) 5190 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1 5191 ){ 5192 /* There is no reason any cursor should have an outstanding reference 5193 ** to an overflow page belonging to a cell that is being deleted/updated. 5194 ** So if there exists more than one reference to this page, then it 5195 ** must not really be an overflow page and the database must be corrupt. 5196 ** It is helpful to detect this before calling freePage2(), as 5197 ** freePage2() may zero the page contents if secure-delete mode is 5198 ** enabled. If this 'overflow' page happens to be a page that the 5199 ** caller is iterating through or using in some other way, this 5200 ** can be problematic. 5201 */ 5202 rc = SQLITE_CORRUPT_BKPT; 5203 }else{ 5204 rc = freePage2(pBt, pOvfl, ovflPgno); 5205 } 5206 5207 if( pOvfl ){ 5208 sqlite3PagerUnref(pOvfl->pDbPage); 5209 } 5210 if( rc ) return rc; 5211 ovflPgno = iNext; 5212 } 5213 return SQLITE_OK; 5214} 5215 5216/* 5217** Create the byte sequence used to represent a cell on page pPage 5218** and write that byte sequence into pCell[]. Overflow pages are 5219** allocated and filled in as necessary. The calling procedure 5220** is responsible for making sure sufficient space has been allocated 5221** for pCell[]. 5222** 5223** Note that pCell does not necessary need to point to the pPage->aData 5224** area. pCell might point to some temporary storage. The cell will 5225** be constructed in this temporary area then copied into pPage->aData 5226** later. 5227*/ 5228static int fillInCell( 5229 MemPage *pPage, /* The page that contains the cell */ 5230 unsigned char *pCell, /* Complete text of the cell */ 5231 const void *pKey, i64 nKey, /* The key */ 5232 const void *pData,int nData, /* The data */ 5233 int nZero, /* Extra zero bytes to append to pData */ 5234 int *pnSize /* Write cell size here */ 5235){ 5236 int nPayload; 5237 const u8 *pSrc; 5238 int nSrc, n, rc; 5239 int spaceLeft; 5240 MemPage *pOvfl = 0; 5241 MemPage *pToRelease = 0; 5242 unsigned char *pPrior; 5243 unsigned char *pPayload; 5244 BtShared *pBt = pPage->pBt; 5245 Pgno pgnoOvfl = 0; 5246 int nHeader; 5247 CellInfo info; 5248 5249 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5250 5251 /* pPage is not necessarily writeable since pCell might be auxiliary 5252 ** buffer space that is separate from the pPage buffer area */ 5253 assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize] 5254 || sqlite3PagerIswriteable(pPage->pDbPage) ); 5255 5256 /* Fill in the header. */ 5257 nHeader = 0; 5258 if( !pPage->leaf ){ 5259 nHeader += 4; 5260 } 5261 if( pPage->hasData ){ 5262 nHeader += putVarint(&pCell[nHeader], nData+nZero); 5263 }else{ 5264 nData = nZero = 0; 5265 } 5266 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); 5267 btreeParseCellPtr(pPage, pCell, &info); 5268 assert( info.nHeader==nHeader ); 5269 assert( info.nKey==nKey ); 5270 assert( info.nData==(u32)(nData+nZero) ); 5271 5272 /* Fill in the payload */ 5273 nPayload = nData + nZero; 5274 if( pPage->intKey ){ 5275 pSrc = pData; 5276 nSrc = nData; 5277 nData = 0; 5278 }else{ 5279 if( NEVER(nKey>0x7fffffff || pKey==0) ){ 5280 return SQLITE_CORRUPT_BKPT; 5281 } 5282 nPayload += (int)nKey; 5283 pSrc = pKey; 5284 nSrc = (int)nKey; 5285 } 5286 *pnSize = info.nSize; 5287 spaceLeft = info.nLocal; 5288 pPayload = &pCell[nHeader]; 5289 pPrior = &pCell[info.iOverflow]; 5290 5291 while( nPayload>0 ){ 5292 if( spaceLeft==0 ){ 5293#ifndef SQLITE_OMIT_AUTOVACUUM 5294 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 5295 if( pBt->autoVacuum ){ 5296 do{ 5297 pgnoOvfl++; 5298 } while( 5299 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 5300 ); 5301 } 5302#endif 5303 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); 5304#ifndef SQLITE_OMIT_AUTOVACUUM 5305 /* If the database supports auto-vacuum, and the second or subsequent 5306 ** overflow page is being allocated, add an entry to the pointer-map 5307 ** for that page now. 5308 ** 5309 ** If this is the first overflow page, then write a partial entry 5310 ** to the pointer-map. If we write nothing to this pointer-map slot, 5311 ** then the optimistic overflow chain processing in clearCell() 5312 ** may misinterpret the uninitialised values and delete the 5313 ** wrong pages from the database. 5314 */ 5315 if( pBt->autoVacuum && rc==SQLITE_OK ){ 5316 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 5317 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc); 5318 if( rc ){ 5319 releasePage(pOvfl); 5320 } 5321 } 5322#endif 5323 if( rc ){ 5324 releasePage(pToRelease); 5325 return rc; 5326 } 5327 5328 /* If pToRelease is not zero than pPrior points into the data area 5329 ** of pToRelease. Make sure pToRelease is still writeable. */ 5330 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 5331 5332 /* If pPrior is part of the data area of pPage, then make sure pPage 5333 ** is still writeable */ 5334 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] 5335 || sqlite3PagerIswriteable(pPage->pDbPage) ); 5336 5337 put4byte(pPrior, pgnoOvfl); 5338 releasePage(pToRelease); 5339 pToRelease = pOvfl; 5340 pPrior = pOvfl->aData; 5341 put4byte(pPrior, 0); 5342 pPayload = &pOvfl->aData[4]; 5343 spaceLeft = pBt->usableSize - 4; 5344 } 5345 n = nPayload; 5346 if( n>spaceLeft ) n = spaceLeft; 5347 5348 /* If pToRelease is not zero than pPayload points into the data area 5349 ** of pToRelease. Make sure pToRelease is still writeable. */ 5350 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 5351 5352 /* If pPayload is part of the data area of pPage, then make sure pPage 5353 ** is still writeable */ 5354 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] 5355 || sqlite3PagerIswriteable(pPage->pDbPage) ); 5356 5357 if( nSrc>0 ){ 5358 if( n>nSrc ) n = nSrc; 5359 assert( pSrc ); 5360 memcpy(pPayload, pSrc, n); 5361 }else{ 5362 memset(pPayload, 0, n); 5363 } 5364 nPayload -= n; 5365 pPayload += n; 5366 pSrc += n; 5367 nSrc -= n; 5368 spaceLeft -= n; 5369 if( nSrc==0 ){ 5370 nSrc = nData; 5371 pSrc = pData; 5372 } 5373 } 5374 releasePage(pToRelease); 5375 return SQLITE_OK; 5376} 5377 5378/* 5379** Remove the i-th cell from pPage. This routine effects pPage only. 5380** The cell content is not freed or deallocated. It is assumed that 5381** the cell content has been copied someplace else. This routine just 5382** removes the reference to the cell from pPage. 5383** 5384** "sz" must be the number of bytes in the cell. 5385*/ 5386static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ 5387 int i; /* Loop counter */ 5388 u32 pc; /* Offset to cell content of cell being deleted */ 5389 u8 *data; /* pPage->aData */ 5390 u8 *ptr; /* Used to move bytes around within data[] */ 5391 int rc; /* The return code */ 5392 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ 5393 5394 if( *pRC ) return; 5395 5396 assert( idx>=0 && idx<pPage->nCell ); 5397 assert( sz==cellSize(pPage, idx) ); 5398 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5399 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5400 data = pPage->aData; 5401 ptr = &data[pPage->cellOffset + 2*idx]; 5402 pc = get2byte(ptr); 5403 hdr = pPage->hdrOffset; 5404 testcase( pc==get2byte(&data[hdr+5]) ); 5405 testcase( pc+sz==pPage->pBt->usableSize ); 5406 if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){ 5407 *pRC = SQLITE_CORRUPT_BKPT; 5408 return; 5409 } 5410 rc = freeSpace(pPage, pc, sz); 5411 if( rc ){ 5412 *pRC = rc; 5413 return; 5414 } 5415 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ 5416 ptr[0] = ptr[2]; 5417 ptr[1] = ptr[3]; 5418 } 5419 pPage->nCell--; 5420 put2byte(&data[hdr+3], pPage->nCell); 5421 pPage->nFree += 2; 5422} 5423 5424/* 5425** Insert a new cell on pPage at cell index "i". pCell points to the 5426** content of the cell. 5427** 5428** If the cell content will fit on the page, then put it there. If it 5429** will not fit, then make a copy of the cell content into pTemp if 5430** pTemp is not null. Regardless of pTemp, allocate a new entry 5431** in pPage->aOvfl[] and make it point to the cell content (either 5432** in pTemp or the original pCell) and also record its index. 5433** Allocating a new entry in pPage->aCell[] implies that 5434** pPage->nOverflow is incremented. 5435** 5436** If nSkip is non-zero, then do not copy the first nSkip bytes of the 5437** cell. The caller will overwrite them after this function returns. If 5438** nSkip is non-zero, then pCell may not point to an invalid memory location 5439** (but pCell+nSkip is always valid). 5440*/ 5441static void insertCell( 5442 MemPage *pPage, /* Page into which we are copying */ 5443 int i, /* New cell becomes the i-th cell of the page */ 5444 u8 *pCell, /* Content of the new cell */ 5445 int sz, /* Bytes of content in pCell */ 5446 u8 *pTemp, /* Temp storage space for pCell, if needed */ 5447 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */ 5448 int *pRC /* Read and write return code from here */ 5449){ 5450 int idx = 0; /* Where to write new cell content in data[] */ 5451 int j; /* Loop counter */ 5452 int end; /* First byte past the last cell pointer in data[] */ 5453 int ins; /* Index in data[] where new cell pointer is inserted */ 5454 int cellOffset; /* Address of first cell pointer in data[] */ 5455 u8 *data; /* The content of the whole page */ 5456 u8 *ptr; /* Used for moving information around in data[] */ 5457 5458 int nSkip = (iChild ? 4 : 0); 5459 5460 if( *pRC ) return; 5461 5462 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 5463 assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 ); 5464 assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) ); 5465 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5466 /* The cell should normally be sized correctly. However, when moving a 5467 ** malformed cell from a leaf page to an interior page, if the cell size 5468 ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size 5469 ** might be less than 8 (leaf-size + pointer) on the interior node. Hence 5470 ** the term after the || in the following assert(). */ 5471 assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) ); 5472 if( pPage->nOverflow || sz+2>pPage->nFree ){ 5473 if( pTemp ){ 5474 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); 5475 pCell = pTemp; 5476 } 5477 if( iChild ){ 5478 put4byte(pCell, iChild); 5479 } 5480 j = pPage->nOverflow++; 5481 assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) ); 5482 pPage->aOvfl[j].pCell = pCell; 5483 pPage->aOvfl[j].idx = (u16)i; 5484 }else{ 5485 int rc = sqlite3PagerWrite(pPage->pDbPage); 5486 if( rc!=SQLITE_OK ){ 5487 *pRC = rc; 5488 return; 5489 } 5490 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5491 data = pPage->aData; 5492 cellOffset = pPage->cellOffset; 5493 end = cellOffset + 2*pPage->nCell; 5494 ins = cellOffset + 2*i; 5495 rc = allocateSpace(pPage, sz, &idx); 5496 if( rc ){ *pRC = rc; return; } 5497 /* The allocateSpace() routine guarantees the following two properties 5498 ** if it returns success */ 5499 assert( idx >= end+2 ); 5500 assert( idx+sz <= (int)pPage->pBt->usableSize ); 5501 pPage->nCell++; 5502 pPage->nFree -= (u16)(2 + sz); 5503 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); 5504 if( iChild ){ 5505 put4byte(&data[idx], iChild); 5506 } 5507 for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){ 5508 ptr[0] = ptr[-2]; 5509 ptr[1] = ptr[-1]; 5510 } 5511 put2byte(&data[ins], idx); 5512 put2byte(&data[pPage->hdrOffset+3], pPage->nCell); 5513#ifndef SQLITE_OMIT_AUTOVACUUM 5514 if( pPage->pBt->autoVacuum ){ 5515 /* The cell may contain a pointer to an overflow page. If so, write 5516 ** the entry for the overflow page into the pointer map. 5517 */ 5518 ptrmapPutOvflPtr(pPage, pCell, pRC); 5519 } 5520#endif 5521 } 5522} 5523 5524/* 5525** Add a list of cells to a page. The page should be initially empty. 5526** The cells are guaranteed to fit on the page. 5527*/ 5528static void assemblePage( 5529 MemPage *pPage, /* The page to be assemblied */ 5530 int nCell, /* The number of cells to add to this page */ 5531 u8 **apCell, /* Pointers to cell bodies */ 5532 u16 *aSize /* Sizes of the cells */ 5533){ 5534 int i; /* Loop counter */ 5535 u8 *pCellptr; /* Address of next cell pointer */ 5536 int cellbody; /* Address of next cell body */ 5537 u8 * const data = pPage->aData; /* Pointer to data for pPage */ 5538 const int hdr = pPage->hdrOffset; /* Offset of header on pPage */ 5539 const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ 5540 5541 assert( pPage->nOverflow==0 ); 5542 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5543 assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt) 5544 && (int)MX_CELL(pPage->pBt)<=10921); 5545 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5546 5547 /* Check that the page has just been zeroed by zeroPage() */ 5548 assert( pPage->nCell==0 ); 5549 assert( get2byteNotZero(&data[hdr+5])==nUsable ); 5550 5551 pCellptr = &data[pPage->cellOffset + nCell*2]; 5552 cellbody = nUsable; 5553 for(i=nCell-1; i>=0; i--){ 5554 pCellptr -= 2; 5555 cellbody -= aSize[i]; 5556 put2byte(pCellptr, cellbody); 5557 memcpy(&data[cellbody], apCell[i], aSize[i]); 5558 } 5559 put2byte(&data[hdr+3], nCell); 5560 put2byte(&data[hdr+5], cellbody); 5561 pPage->nFree -= (nCell*2 + nUsable - cellbody); 5562 pPage->nCell = (u16)nCell; 5563} 5564 5565/* 5566** The following parameters determine how many adjacent pages get involved 5567** in a balancing operation. NN is the number of neighbors on either side 5568** of the page that participate in the balancing operation. NB is the 5569** total number of pages that participate, including the target page and 5570** NN neighbors on either side. 5571** 5572** The minimum value of NN is 1 (of course). Increasing NN above 1 5573** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 5574** in exchange for a larger degradation in INSERT and UPDATE performance. 5575** The value of NN appears to give the best results overall. 5576*/ 5577#define NN 1 /* Number of neighbors on either side of pPage */ 5578#define NB (NN*2+1) /* Total pages involved in the balance */ 5579 5580 5581#ifndef SQLITE_OMIT_QUICKBALANCE 5582/* 5583** This version of balance() handles the common special case where 5584** a new entry is being inserted on the extreme right-end of the 5585** tree, in other words, when the new entry will become the largest 5586** entry in the tree. 5587** 5588** Instead of trying to balance the 3 right-most leaf pages, just add 5589** a new page to the right-hand side and put the one new entry in 5590** that page. This leaves the right side of the tree somewhat 5591** unbalanced. But odds are that we will be inserting new entries 5592** at the end soon afterwards so the nearly empty page will quickly 5593** fill up. On average. 5594** 5595** pPage is the leaf page which is the right-most page in the tree. 5596** pParent is its parent. pPage must have a single overflow entry 5597** which is also the right-most entry on the page. 5598** 5599** The pSpace buffer is used to store a temporary copy of the divider 5600** cell that will be inserted into pParent. Such a cell consists of a 4 5601** byte page number followed by a variable length integer. In other 5602** words, at most 13 bytes. Hence the pSpace buffer must be at 5603** least 13 bytes in size. 5604*/ 5605static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ 5606 BtShared *const pBt = pPage->pBt; /* B-Tree Database */ 5607 MemPage *pNew; /* Newly allocated page */ 5608 int rc; /* Return Code */ 5609 Pgno pgnoNew; /* Page number of pNew */ 5610 5611 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5612 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 5613 assert( pPage->nOverflow==1 ); 5614 5615 /* This error condition is now caught prior to reaching this function */ 5616 if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT; 5617 5618 /* Allocate a new page. This page will become the right-sibling of 5619 ** pPage. Make the parent page writable, so that the new divider cell 5620 ** may be inserted. If both these operations are successful, proceed. 5621 */ 5622 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 5623 5624 if( rc==SQLITE_OK ){ 5625 5626 u8 *pOut = &pSpace[4]; 5627 u8 *pCell = pPage->aOvfl[0].pCell; 5628 u16 szCell = cellSizePtr(pPage, pCell); 5629 u8 *pStop; 5630 5631 assert( sqlite3PagerIswriteable(pNew->pDbPage) ); 5632 assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); 5633 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); 5634 assemblePage(pNew, 1, &pCell, &szCell); 5635 5636 /* If this is an auto-vacuum database, update the pointer map 5637 ** with entries for the new page, and any pointer from the 5638 ** cell on the page to an overflow page. If either of these 5639 ** operations fails, the return code is set, but the contents 5640 ** of the parent page are still manipulated by thh code below. 5641 ** That is Ok, at this point the parent page is guaranteed to 5642 ** be marked as dirty. Returning an error code will cause a 5643 ** rollback, undoing any changes made to the parent page. 5644 */ 5645 if( ISAUTOVACUUM ){ 5646 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); 5647 if( szCell>pNew->minLocal ){ 5648 ptrmapPutOvflPtr(pNew, pCell, &rc); 5649 } 5650 } 5651 5652 /* Create a divider cell to insert into pParent. The divider cell 5653 ** consists of a 4-byte page number (the page number of pPage) and 5654 ** a variable length key value (which must be the same value as the 5655 ** largest key on pPage). 5656 ** 5657 ** To find the largest key value on pPage, first find the right-most 5658 ** cell on pPage. The first two fields of this cell are the 5659 ** record-length (a variable length integer at most 32-bits in size) 5660 ** and the key value (a variable length integer, may have any value). 5661 ** The first of the while(...) loops below skips over the record-length 5662 ** field. The second while(...) loop copies the key value from the 5663 ** cell on pPage into the pSpace buffer. 5664 */ 5665 pCell = findCell(pPage, pPage->nCell-1); 5666 pStop = &pCell[9]; 5667 while( (*(pCell++)&0x80) && pCell<pStop ); 5668 pStop = &pCell[9]; 5669 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); 5670 5671 /* Insert the new divider cell into pParent. */ 5672 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), 5673 0, pPage->pgno, &rc); 5674 5675 /* Set the right-child pointer of pParent to point to the new page. */ 5676 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 5677 5678 /* Release the reference to the new page. */ 5679 releasePage(pNew); 5680 } 5681 5682 return rc; 5683} 5684#endif /* SQLITE_OMIT_QUICKBALANCE */ 5685 5686#if 0 5687/* 5688** This function does not contribute anything to the operation of SQLite. 5689** it is sometimes activated temporarily while debugging code responsible 5690** for setting pointer-map entries. 5691*/ 5692static int ptrmapCheckPages(MemPage **apPage, int nPage){ 5693 int i, j; 5694 for(i=0; i<nPage; i++){ 5695 Pgno n; 5696 u8 e; 5697 MemPage *pPage = apPage[i]; 5698 BtShared *pBt = pPage->pBt; 5699 assert( pPage->isInit ); 5700 5701 for(j=0; j<pPage->nCell; j++){ 5702 CellInfo info; 5703 u8 *z; 5704 5705 z = findCell(pPage, j); 5706 btreeParseCellPtr(pPage, z, &info); 5707 if( info.iOverflow ){ 5708 Pgno ovfl = get4byte(&z[info.iOverflow]); 5709 ptrmapGet(pBt, ovfl, &e, &n); 5710 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); 5711 } 5712 if( !pPage->leaf ){ 5713 Pgno child = get4byte(z); 5714 ptrmapGet(pBt, child, &e, &n); 5715 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 5716 } 5717 } 5718 if( !pPage->leaf ){ 5719 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); 5720 ptrmapGet(pBt, child, &e, &n); 5721 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 5722 } 5723 } 5724 return 1; 5725} 5726#endif 5727 5728/* 5729** This function is used to copy the contents of the b-tree node stored 5730** on page pFrom to page pTo. If page pFrom was not a leaf page, then 5731** the pointer-map entries for each child page are updated so that the 5732** parent page stored in the pointer map is page pTo. If pFrom contained 5733** any cells with overflow page pointers, then the corresponding pointer 5734** map entries are also updated so that the parent page is page pTo. 5735** 5736** If pFrom is currently carrying any overflow cells (entries in the 5737** MemPage.aOvfl[] array), they are not copied to pTo. 5738** 5739** Before returning, page pTo is reinitialized using btreeInitPage(). 5740** 5741** The performance of this function is not critical. It is only used by 5742** the balance_shallower() and balance_deeper() procedures, neither of 5743** which are called often under normal circumstances. 5744*/ 5745static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ 5746 if( (*pRC)==SQLITE_OK ){ 5747 BtShared * const pBt = pFrom->pBt; 5748 u8 * const aFrom = pFrom->aData; 5749 u8 * const aTo = pTo->aData; 5750 int const iFromHdr = pFrom->hdrOffset; 5751 int const iToHdr = ((pTo->pgno==1) ? 100 : 0); 5752 int rc; 5753 int iData; 5754 5755 5756 assert( pFrom->isInit ); 5757 assert( pFrom->nFree>=iToHdr ); 5758 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize ); 5759 5760 /* Copy the b-tree node content from page pFrom to page pTo. */ 5761 iData = get2byte(&aFrom[iFromHdr+5]); 5762 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); 5763 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); 5764 5765 /* Reinitialize page pTo so that the contents of the MemPage structure 5766 ** match the new data. The initialization of pTo can actually fail under 5767 ** fairly obscure circumstances, even though it is a copy of initialized 5768 ** page pFrom. 5769 */ 5770 pTo->isInit = 0; 5771 rc = btreeInitPage(pTo); 5772 if( rc!=SQLITE_OK ){ 5773 *pRC = rc; 5774 return; 5775 } 5776 5777 /* If this is an auto-vacuum database, update the pointer-map entries 5778 ** for any b-tree or overflow pages that pTo now contains the pointers to. 5779 */ 5780 if( ISAUTOVACUUM ){ 5781 *pRC = setChildPtrmaps(pTo); 5782 } 5783 } 5784} 5785 5786/* 5787** This routine redistributes cells on the iParentIdx'th child of pParent 5788** (hereafter "the page") and up to 2 siblings so that all pages have about the 5789** same amount of free space. Usually a single sibling on either side of the 5790** page are used in the balancing, though both siblings might come from one 5791** side if the page is the first or last child of its parent. If the page 5792** has fewer than 2 siblings (something which can only happen if the page 5793** is a root page or a child of a root page) then all available siblings 5794** participate in the balancing. 5795** 5796** The number of siblings of the page might be increased or decreased by 5797** one or two in an effort to keep pages nearly full but not over full. 5798** 5799** Note that when this routine is called, some of the cells on the page 5800** might not actually be stored in MemPage.aData[]. This can happen 5801** if the page is overfull. This routine ensures that all cells allocated 5802** to the page and its siblings fit into MemPage.aData[] before returning. 5803** 5804** In the course of balancing the page and its siblings, cells may be 5805** inserted into or removed from the parent page (pParent). Doing so 5806** may cause the parent page to become overfull or underfull. If this 5807** happens, it is the responsibility of the caller to invoke the correct 5808** balancing routine to fix this problem (see the balance() routine). 5809** 5810** If this routine fails for any reason, it might leave the database 5811** in a corrupted state. So if this routine fails, the database should 5812** be rolled back. 5813** 5814** The third argument to this function, aOvflSpace, is a pointer to a 5815** buffer big enough to hold one page. If while inserting cells into the parent 5816** page (pParent) the parent page becomes overfull, this buffer is 5817** used to store the parent's overflow cells. Because this function inserts 5818** a maximum of four divider cells into the parent page, and the maximum 5819** size of a cell stored within an internal node is always less than 1/4 5820** of the page-size, the aOvflSpace[] buffer is guaranteed to be large 5821** enough for all overflow cells. 5822** 5823** If aOvflSpace is set to a null pointer, this function returns 5824** SQLITE_NOMEM. 5825*/ 5826static int balance_nonroot( 5827 MemPage *pParent, /* Parent page of siblings being balanced */ 5828 int iParentIdx, /* Index of "the page" in pParent */ 5829 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ 5830 int isRoot /* True if pParent is a root-page */ 5831){ 5832 BtShared *pBt; /* The whole database */ 5833 int nCell = 0; /* Number of cells in apCell[] */ 5834 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 5835 int nNew = 0; /* Number of pages in apNew[] */ 5836 int nOld; /* Number of pages in apOld[] */ 5837 int i, j, k; /* Loop counters */ 5838 int nxDiv; /* Next divider slot in pParent->aCell[] */ 5839 int rc = SQLITE_OK; /* The return code */ 5840 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 5841 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 5842 int usableSpace; /* Bytes in pPage beyond the header */ 5843 int pageFlags; /* Value of pPage->aData[0] */ 5844 int subtotal; /* Subtotal of bytes in cells on one page */ 5845 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 5846 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ 5847 int szScratch; /* Size of scratch memory requested */ 5848 MemPage *apOld[NB]; /* pPage and up to two siblings */ 5849 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ 5850 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 5851 u8 *pRight; /* Location in parent of right-sibling pointer */ 5852 u8 *apDiv[NB-1]; /* Divider cells in pParent */ 5853 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ 5854 int szNew[NB+2]; /* Combined size of cells place on i-th page */ 5855 u8 **apCell = 0; /* All cells begin balanced */ 5856 u16 *szCell; /* Local size of all cells in apCell[] */ 5857 u8 *aSpace1; /* Space for copies of dividers cells */ 5858 Pgno pgno; /* Temp var to store a page number in */ 5859 5860 pBt = pParent->pBt; 5861 assert( sqlite3_mutex_held(pBt->mutex) ); 5862 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 5863 5864#if 0 5865 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); 5866#endif 5867 5868 /* At this point pParent may have at most one overflow cell. And if 5869 ** this overflow cell is present, it must be the cell with 5870 ** index iParentIdx. This scenario comes about when this function 5871 ** is called (indirectly) from sqlite3BtreeDelete(). 5872 */ 5873 assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); 5874 assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx ); 5875 5876 if( !aOvflSpace ){ 5877 return SQLITE_NOMEM; 5878 } 5879 5880 /* Find the sibling pages to balance. Also locate the cells in pParent 5881 ** that divide the siblings. An attempt is made to find NN siblings on 5882 ** either side of pPage. More siblings are taken from one side, however, 5883 ** if there are fewer than NN siblings on the other side. If pParent 5884 ** has NB or fewer children then all children of pParent are taken. 5885 ** 5886 ** This loop also drops the divider cells from the parent page. This 5887 ** way, the remainder of the function does not have to deal with any 5888 ** overflow cells in the parent page, since if any existed they will 5889 ** have already been removed. 5890 */ 5891 i = pParent->nOverflow + pParent->nCell; 5892 if( i<2 ){ 5893 nxDiv = 0; 5894 nOld = i+1; 5895 }else{ 5896 nOld = 3; 5897 if( iParentIdx==0 ){ 5898 nxDiv = 0; 5899 }else if( iParentIdx==i ){ 5900 nxDiv = i-2; 5901 }else{ 5902 nxDiv = iParentIdx-1; 5903 } 5904 i = 2; 5905 } 5906 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ 5907 pRight = &pParent->aData[pParent->hdrOffset+8]; 5908 }else{ 5909 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); 5910 } 5911 pgno = get4byte(pRight); 5912 while( 1 ){ 5913 rc = getAndInitPage(pBt, pgno, &apOld[i]); 5914 if( rc ){ 5915 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 5916 goto balance_cleanup; 5917 } 5918 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; 5919 if( (i--)==0 ) break; 5920 5921 if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){ 5922 apDiv[i] = pParent->aOvfl[0].pCell; 5923 pgno = get4byte(apDiv[i]); 5924 szNew[i] = cellSizePtr(pParent, apDiv[i]); 5925 pParent->nOverflow = 0; 5926 }else{ 5927 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); 5928 pgno = get4byte(apDiv[i]); 5929 szNew[i] = cellSizePtr(pParent, apDiv[i]); 5930 5931 /* Drop the cell from the parent page. apDiv[i] still points to 5932 ** the cell within the parent, even though it has been dropped. 5933 ** This is safe because dropping a cell only overwrites the first 5934 ** four bytes of it, and this function does not need the first 5935 ** four bytes of the divider cell. So the pointer is safe to use 5936 ** later on. 5937 ** 5938 ** Unless SQLite is compiled in secure-delete mode. In this case, 5939 ** the dropCell() routine will overwrite the entire cell with zeroes. 5940 ** In this case, temporarily copy the cell into the aOvflSpace[] 5941 ** buffer. It will be copied out again as soon as the aSpace[] buffer 5942 ** is allocated. */ 5943 if( pBt->secureDelete ){ 5944 int iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); 5945 if( (iOff+szNew[i])>(int)pBt->usableSize ){ 5946 rc = SQLITE_CORRUPT_BKPT; 5947 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 5948 goto balance_cleanup; 5949 }else{ 5950 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); 5951 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; 5952 } 5953 } 5954 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); 5955 } 5956 } 5957 5958 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 5959 ** alignment */ 5960 nMaxCells = (nMaxCells + 3)&~3; 5961 5962 /* 5963 ** Allocate space for memory structures 5964 */ 5965 k = pBt->pageSize + ROUND8(sizeof(MemPage)); 5966 szScratch = 5967 nMaxCells*sizeof(u8*) /* apCell */ 5968 + nMaxCells*sizeof(u16) /* szCell */ 5969 + pBt->pageSize /* aSpace1 */ 5970 + k*nOld; /* Page copies (apCopy) */ 5971 apCell = sqlite3ScratchMalloc( szScratch ); 5972 if( apCell==0 ){ 5973 rc = SQLITE_NOMEM; 5974 goto balance_cleanup; 5975 } 5976 szCell = (u16*)&apCell[nMaxCells]; 5977 aSpace1 = (u8*)&szCell[nMaxCells]; 5978 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 5979 5980 /* 5981 ** Load pointers to all cells on sibling pages and the divider cells 5982 ** into the local apCell[] array. Make copies of the divider cells 5983 ** into space obtained from aSpace1[] and remove the the divider Cells 5984 ** from pParent. 5985 ** 5986 ** If the siblings are on leaf pages, then the child pointers of the 5987 ** divider cells are stripped from the cells before they are copied 5988 ** into aSpace1[]. In this way, all cells in apCell[] are without 5989 ** child pointers. If siblings are not leaves, then all cell in 5990 ** apCell[] include child pointers. Either way, all cells in apCell[] 5991 ** are alike. 5992 ** 5993 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 5994 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 5995 */ 5996 leafCorrection = apOld[0]->leaf*4; 5997 leafData = apOld[0]->hasData; 5998 for(i=0; i<nOld; i++){ 5999 int limit; 6000 6001 /* Before doing anything else, take a copy of the i'th original sibling 6002 ** The rest of this function will use data from the copies rather 6003 ** that the original pages since the original pages will be in the 6004 ** process of being overwritten. */ 6005 MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; 6006 memcpy(pOld, apOld[i], sizeof(MemPage)); 6007 pOld->aData = (void*)&pOld[1]; 6008 memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); 6009 6010 limit = pOld->nCell+pOld->nOverflow; 6011 for(j=0; j<limit; j++){ 6012 assert( nCell<nMaxCells ); 6013 apCell[nCell] = findOverflowCell(pOld, j); 6014 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); 6015 nCell++; 6016 } 6017 if( i<nOld-1 && !leafData){ 6018 u16 sz = (u16)szNew[i]; 6019 u8 *pTemp; 6020 assert( nCell<nMaxCells ); 6021 szCell[nCell] = sz; 6022 pTemp = &aSpace1[iSpace1]; 6023 iSpace1 += sz; 6024 assert( sz<=pBt->maxLocal+23 ); 6025 assert( iSpace1 <= (int)pBt->pageSize ); 6026 memcpy(pTemp, apDiv[i], sz); 6027 apCell[nCell] = pTemp+leafCorrection; 6028 assert( leafCorrection==0 || leafCorrection==4 ); 6029 szCell[nCell] = szCell[nCell] - leafCorrection; 6030 if( !pOld->leaf ){ 6031 assert( leafCorrection==0 ); 6032 assert( pOld->hdrOffset==0 ); 6033 /* The right pointer of the child page pOld becomes the left 6034 ** pointer of the divider cell */ 6035 memcpy(apCell[nCell], &pOld->aData[8], 4); 6036 }else{ 6037 assert( leafCorrection==4 ); 6038 if( szCell[nCell]<4 ){ 6039 /* Do not allow any cells smaller than 4 bytes. */ 6040 szCell[nCell] = 4; 6041 } 6042 } 6043 nCell++; 6044 } 6045 } 6046 6047 /* 6048 ** Figure out the number of pages needed to hold all nCell cells. 6049 ** Store this number in "k". Also compute szNew[] which is the total 6050 ** size of all cells on the i-th page and cntNew[] which is the index 6051 ** in apCell[] of the cell that divides page i from page i+1. 6052 ** cntNew[k] should equal nCell. 6053 ** 6054 ** Values computed by this block: 6055 ** 6056 ** k: The total number of sibling pages 6057 ** szNew[i]: Spaced used on the i-th sibling page. 6058 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to 6059 ** the right of the i-th sibling page. 6060 ** usableSpace: Number of bytes of space available on each sibling. 6061 ** 6062 */ 6063 usableSpace = pBt->usableSize - 12 + leafCorrection; 6064 for(subtotal=k=i=0; i<nCell; i++){ 6065 assert( i<nMaxCells ); 6066 subtotal += szCell[i] + 2; 6067 if( subtotal > usableSpace ){ 6068 szNew[k] = subtotal - szCell[i]; 6069 cntNew[k] = i; 6070 if( leafData ){ i--; } 6071 subtotal = 0; 6072 k++; 6073 if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } 6074 } 6075 } 6076 szNew[k] = subtotal; 6077 cntNew[k] = nCell; 6078 k++; 6079 6080 /* 6081 ** The packing computed by the previous block is biased toward the siblings 6082 ** on the left side. The left siblings are always nearly full, while the 6083 ** right-most sibling might be nearly empty. This block of code attempts 6084 ** to adjust the packing of siblings to get a better balance. 6085 ** 6086 ** This adjustment is more than an optimization. The packing above might 6087 ** be so out of balance as to be illegal. For example, the right-most 6088 ** sibling might be completely empty. This adjustment is not optional. 6089 */ 6090 for(i=k-1; i>0; i--){ 6091 int szRight = szNew[i]; /* Size of sibling on the right */ 6092 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 6093 int r; /* Index of right-most cell in left sibling */ 6094 int d; /* Index of first cell to the left of right sibling */ 6095 6096 r = cntNew[i-1] - 1; 6097 d = r + 1 - leafData; 6098 assert( d<nMaxCells ); 6099 assert( r<nMaxCells ); 6100 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){ 6101 szRight += szCell[d] + 2; 6102 szLeft -= szCell[r] + 2; 6103 cntNew[i-1]--; 6104 r = cntNew[i-1] - 1; 6105 d = r + 1 - leafData; 6106 } 6107 szNew[i] = szRight; 6108 szNew[i-1] = szLeft; 6109 } 6110 6111 /* Either we found one or more cells (cntnew[0])>0) or pPage is 6112 ** a virtual root page. A virtual root page is when the real root 6113 ** page is page 1 and we are the only child of that page. 6114 */ 6115 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); 6116 6117 TRACE(("BALANCE: old: %d %d %d ", 6118 apOld[0]->pgno, 6119 nOld>=2 ? apOld[1]->pgno : 0, 6120 nOld>=3 ? apOld[2]->pgno : 0 6121 )); 6122 6123 /* 6124 ** Allocate k new pages. Reuse old pages where possible. 6125 */ 6126 if( apOld[0]->pgno<=1 ){ 6127 rc = SQLITE_CORRUPT_BKPT; 6128 goto balance_cleanup; 6129 } 6130 pageFlags = apOld[0]->aData[0]; 6131 for(i=0; i<k; i++){ 6132 MemPage *pNew; 6133 if( i<nOld ){ 6134 pNew = apNew[i] = apOld[i]; 6135 apOld[i] = 0; 6136 rc = sqlite3PagerWrite(pNew->pDbPage); 6137 nNew++; 6138 if( rc ) goto balance_cleanup; 6139 }else{ 6140 assert( i>0 ); 6141 rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0); 6142 if( rc ) goto balance_cleanup; 6143 apNew[i] = pNew; 6144 nNew++; 6145 6146 /* Set the pointer-map entry for the new sibling page. */ 6147 if( ISAUTOVACUUM ){ 6148 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); 6149 if( rc!=SQLITE_OK ){ 6150 goto balance_cleanup; 6151 } 6152 } 6153 } 6154 } 6155 6156 /* Free any old pages that were not reused as new pages. 6157 */ 6158 while( i<nOld ){ 6159 freePage(apOld[i], &rc); 6160 if( rc ) goto balance_cleanup; 6161 releasePage(apOld[i]); 6162 apOld[i] = 0; 6163 i++; 6164 } 6165 6166 /* 6167 ** Put the new pages in accending order. This helps to 6168 ** keep entries in the disk file in order so that a scan 6169 ** of the table is a linear scan through the file. That 6170 ** in turn helps the operating system to deliver pages 6171 ** from the disk more rapidly. 6172 ** 6173 ** An O(n^2) insertion sort algorithm is used, but since 6174 ** n is never more than NB (a small constant), that should 6175 ** not be a problem. 6176 ** 6177 ** When NB==3, this one optimization makes the database 6178 ** about 25% faster for large insertions and deletions. 6179 */ 6180 for(i=0; i<k-1; i++){ 6181 int minV = apNew[i]->pgno; 6182 int minI = i; 6183 for(j=i+1; j<k; j++){ 6184 if( apNew[j]->pgno<(unsigned)minV ){ 6185 minI = j; 6186 minV = apNew[j]->pgno; 6187 } 6188 } 6189 if( minI>i ){ 6190 MemPage *pT; 6191 pT = apNew[i]; 6192 apNew[i] = apNew[minI]; 6193 apNew[minI] = pT; 6194 } 6195 } 6196 TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", 6197 apNew[0]->pgno, szNew[0], 6198 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, 6199 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, 6200 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, 6201 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); 6202 6203 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6204 put4byte(pRight, apNew[nNew-1]->pgno); 6205 6206 /* 6207 ** Evenly distribute the data in apCell[] across the new pages. 6208 ** Insert divider cells into pParent as necessary. 6209 */ 6210 j = 0; 6211 for(i=0; i<nNew; i++){ 6212 /* Assemble the new sibling page. */ 6213 MemPage *pNew = apNew[i]; 6214 assert( j<nMaxCells ); 6215 zeroPage(pNew, pageFlags); 6216 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); 6217 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); 6218 assert( pNew->nOverflow==0 ); 6219 6220 j = cntNew[i]; 6221 6222 /* If the sibling page assembled above was not the right-most sibling, 6223 ** insert a divider cell into the parent page. 6224 */ 6225 assert( i<nNew-1 || j==nCell ); 6226 if( j<nCell ){ 6227 u8 *pCell; 6228 u8 *pTemp; 6229 int sz; 6230 6231 assert( j<nMaxCells ); 6232 pCell = apCell[j]; 6233 sz = szCell[j] + leafCorrection; 6234 pTemp = &aOvflSpace[iOvflSpace]; 6235 if( !pNew->leaf ){ 6236 memcpy(&pNew->aData[8], pCell, 4); 6237 }else if( leafData ){ 6238 /* If the tree is a leaf-data tree, and the siblings are leaves, 6239 ** then there is no divider cell in apCell[]. Instead, the divider 6240 ** cell consists of the integer key for the right-most cell of 6241 ** the sibling-page assembled above only. 6242 */ 6243 CellInfo info; 6244 j--; 6245 btreeParseCellPtr(pNew, apCell[j], &info); 6246 pCell = pTemp; 6247 sz = 4 + putVarint(&pCell[4], info.nKey); 6248 pTemp = 0; 6249 }else{ 6250 pCell -= 4; 6251 /* Obscure case for non-leaf-data trees: If the cell at pCell was 6252 ** previously stored on a leaf node, and its reported size was 4 6253 ** bytes, then it may actually be smaller than this 6254 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of 6255 ** any cell). But it is important to pass the correct size to 6256 ** insertCell(), so reparse the cell now. 6257 ** 6258 ** Note that this can never happen in an SQLite data file, as all 6259 ** cells are at least 4 bytes. It only happens in b-trees used 6260 ** to evaluate "IN (SELECT ...)" and similar clauses. 6261 */ 6262 if( szCell[j]==4 ){ 6263 assert(leafCorrection==4); 6264 sz = cellSizePtr(pParent, pCell); 6265 } 6266 } 6267 iOvflSpace += sz; 6268 assert( sz<=pBt->maxLocal+23 ); 6269 assert( iOvflSpace <= (int)pBt->pageSize ); 6270 insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc); 6271 if( rc!=SQLITE_OK ) goto balance_cleanup; 6272 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6273 6274 j++; 6275 nxDiv++; 6276 } 6277 } 6278 assert( j==nCell ); 6279 assert( nOld>0 ); 6280 assert( nNew>0 ); 6281 if( (pageFlags & PTF_LEAF)==0 ){ 6282 u8 *zChild = &apCopy[nOld-1]->aData[8]; 6283 memcpy(&apNew[nNew-1]->aData[8], zChild, 4); 6284 } 6285 6286 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ 6287 /* The root page of the b-tree now contains no cells. The only sibling 6288 ** page is the right-child of the parent. Copy the contents of the 6289 ** child page into the parent, decreasing the overall height of the 6290 ** b-tree structure by one. This is described as the "balance-shallower" 6291 ** sub-algorithm in some documentation. 6292 ** 6293 ** If this is an auto-vacuum database, the call to copyNodeContent() 6294 ** sets all pointer-map entries corresponding to database image pages 6295 ** for which the pointer is stored within the content being copied. 6296 ** 6297 ** The second assert below verifies that the child page is defragmented 6298 ** (it must be, as it was just reconstructed using assemblePage()). This 6299 ** is important if the parent page happens to be page 1 of the database 6300 ** image. */ 6301 assert( nNew==1 ); 6302 assert( apNew[0]->nFree == 6303 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) 6304 ); 6305 copyNodeContent(apNew[0], pParent, &rc); 6306 freePage(apNew[0], &rc); 6307 }else if( ISAUTOVACUUM ){ 6308 /* Fix the pointer-map entries for all the cells that were shifted around. 6309 ** There are several different types of pointer-map entries that need to 6310 ** be dealt with by this routine. Some of these have been set already, but 6311 ** many have not. The following is a summary: 6312 ** 6313 ** 1) The entries associated with new sibling pages that were not 6314 ** siblings when this function was called. These have already 6315 ** been set. We don't need to worry about old siblings that were 6316 ** moved to the free-list - the freePage() code has taken care 6317 ** of those. 6318 ** 6319 ** 2) The pointer-map entries associated with the first overflow 6320 ** page in any overflow chains used by new divider cells. These 6321 ** have also already been taken care of by the insertCell() code. 6322 ** 6323 ** 3) If the sibling pages are not leaves, then the child pages of 6324 ** cells stored on the sibling pages may need to be updated. 6325 ** 6326 ** 4) If the sibling pages are not internal intkey nodes, then any 6327 ** overflow pages used by these cells may need to be updated 6328 ** (internal intkey nodes never contain pointers to overflow pages). 6329 ** 6330 ** 5) If the sibling pages are not leaves, then the pointer-map 6331 ** entries for the right-child pages of each sibling may need 6332 ** to be updated. 6333 ** 6334 ** Cases 1 and 2 are dealt with above by other code. The next 6335 ** block deals with cases 3 and 4 and the one after that, case 5. Since 6336 ** setting a pointer map entry is a relatively expensive operation, this 6337 ** code only sets pointer map entries for child or overflow pages that have 6338 ** actually moved between pages. */ 6339 MemPage *pNew = apNew[0]; 6340 MemPage *pOld = apCopy[0]; 6341 int nOverflow = pOld->nOverflow; 6342 int iNextOld = pOld->nCell + nOverflow; 6343 int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1); 6344 j = 0; /* Current 'old' sibling page */ 6345 k = 0; /* Current 'new' sibling page */ 6346 for(i=0; i<nCell; i++){ 6347 int isDivider = 0; 6348 while( i==iNextOld ){ 6349 /* Cell i is the cell immediately following the last cell on old 6350 ** sibling page j. If the siblings are not leaf pages of an 6351 ** intkey b-tree, then cell i was a divider cell. */ 6352 pOld = apCopy[++j]; 6353 iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; 6354 if( pOld->nOverflow ){ 6355 nOverflow = pOld->nOverflow; 6356 iOverflow = i + !leafData + pOld->aOvfl[0].idx; 6357 } 6358 isDivider = !leafData; 6359 } 6360 6361 assert(nOverflow>0 || iOverflow<i ); 6362 assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1); 6363 assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1); 6364 if( i==iOverflow ){ 6365 isDivider = 1; 6366 if( (--nOverflow)>0 ){ 6367 iOverflow++; 6368 } 6369 } 6370 6371 if( i==cntNew[k] ){ 6372 /* Cell i is the cell immediately following the last cell on new 6373 ** sibling page k. If the siblings are not leaf pages of an 6374 ** intkey b-tree, then cell i is a divider cell. */ 6375 pNew = apNew[++k]; 6376 if( !leafData ) continue; 6377 } 6378 assert( j<nOld ); 6379 assert( k<nNew ); 6380 6381 /* If the cell was originally divider cell (and is not now) or 6382 ** an overflow cell, or if the cell was located on a different sibling 6383 ** page before the balancing, then the pointer map entries associated 6384 ** with any child or overflow pages need to be updated. */ 6385 if( isDivider || pOld->pgno!=pNew->pgno ){ 6386 if( !leafCorrection ){ 6387 ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc); 6388 } 6389 if( szCell[i]>pNew->minLocal ){ 6390 ptrmapPutOvflPtr(pNew, apCell[i], &rc); 6391 } 6392 } 6393 } 6394 6395 if( !leafCorrection ){ 6396 for(i=0; i<nNew; i++){ 6397 u32 key = get4byte(&apNew[i]->aData[8]); 6398 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); 6399 } 6400 } 6401 6402#if 0 6403 /* The ptrmapCheckPages() contains assert() statements that verify that 6404 ** all pointer map pages are set correctly. This is helpful while 6405 ** debugging. This is usually disabled because a corrupt database may 6406 ** cause an assert() statement to fail. */ 6407 ptrmapCheckPages(apNew, nNew); 6408 ptrmapCheckPages(&pParent, 1); 6409#endif 6410 } 6411 6412 assert( pParent->isInit ); 6413 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", 6414 nOld, nNew, nCell)); 6415 6416 /* 6417 ** Cleanup before returning. 6418 */ 6419balance_cleanup: 6420 sqlite3ScratchFree(apCell); 6421 for(i=0; i<nOld; i++){ 6422 releasePage(apOld[i]); 6423 } 6424 for(i=0; i<nNew; i++){ 6425 releasePage(apNew[i]); 6426 } 6427 6428 return rc; 6429} 6430 6431 6432/* 6433** This function is called when the root page of a b-tree structure is 6434** overfull (has one or more overflow pages). 6435** 6436** A new child page is allocated and the contents of the current root 6437** page, including overflow cells, are copied into the child. The root 6438** page is then overwritten to make it an empty page with the right-child 6439** pointer pointing to the new page. 6440** 6441** Before returning, all pointer-map entries corresponding to pages 6442** that the new child-page now contains pointers to are updated. The 6443** entry corresponding to the new right-child pointer of the root 6444** page is also updated. 6445** 6446** If successful, *ppChild is set to contain a reference to the child 6447** page and SQLITE_OK is returned. In this case the caller is required 6448** to call releasePage() on *ppChild exactly once. If an error occurs, 6449** an error code is returned and *ppChild is set to 0. 6450*/ 6451static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ 6452 int rc; /* Return value from subprocedures */ 6453 MemPage *pChild = 0; /* Pointer to a new child page */ 6454 Pgno pgnoChild = 0; /* Page number of the new child page */ 6455 BtShared *pBt = pRoot->pBt; /* The BTree */ 6456 6457 assert( pRoot->nOverflow>0 ); 6458 assert( sqlite3_mutex_held(pBt->mutex) ); 6459 6460 /* Make pRoot, the root page of the b-tree, writable. Allocate a new 6461 ** page that will become the new right-child of pPage. Copy the contents 6462 ** of the node stored on pRoot into the new child page. 6463 */ 6464 rc = sqlite3PagerWrite(pRoot->pDbPage); 6465 if( rc==SQLITE_OK ){ 6466 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); 6467 copyNodeContent(pRoot, pChild, &rc); 6468 if( ISAUTOVACUUM ){ 6469 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); 6470 } 6471 } 6472 if( rc ){ 6473 *ppChild = 0; 6474 releasePage(pChild); 6475 return rc; 6476 } 6477 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 6478 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 6479 assert( pChild->nCell==pRoot->nCell ); 6480 6481 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno)); 6482 6483 /* Copy the overflow cells from pRoot to pChild */ 6484 memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0])); 6485 pChild->nOverflow = pRoot->nOverflow; 6486 6487 /* Zero the contents of pRoot. Then install pChild as the right-child. */ 6488 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); 6489 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); 6490 6491 *ppChild = pChild; 6492 return SQLITE_OK; 6493} 6494 6495/* 6496** The page that pCur currently points to has just been modified in 6497** some way. This function figures out if this modification means the 6498** tree needs to be balanced, and if so calls the appropriate balancing 6499** routine. Balancing routines are: 6500** 6501** balance_quick() 6502** balance_deeper() 6503** balance_nonroot() 6504*/ 6505static int balance(BtCursor *pCur){ 6506 int rc = SQLITE_OK; 6507 const int nMin = pCur->pBt->usableSize * 2 / 3; 6508 u8 aBalanceQuickSpace[13]; 6509 u8 *pFree = 0; 6510 6511 TESTONLY( int balance_quick_called = 0 ); 6512 TESTONLY( int balance_deeper_called = 0 ); 6513 6514 do { 6515 int iPage = pCur->iPage; 6516 MemPage *pPage = pCur->apPage[iPage]; 6517 6518 if( iPage==0 ){ 6519 if( pPage->nOverflow ){ 6520 /* The root page of the b-tree is overfull. In this case call the 6521 ** balance_deeper() function to create a new child for the root-page 6522 ** and copy the current contents of the root-page to it. The 6523 ** next iteration of the do-loop will balance the child page. 6524 */ 6525 assert( (balance_deeper_called++)==0 ); 6526 rc = balance_deeper(pPage, &pCur->apPage[1]); 6527 if( rc==SQLITE_OK ){ 6528 pCur->iPage = 1; 6529 pCur->aiIdx[0] = 0; 6530 pCur->aiIdx[1] = 0; 6531 assert( pCur->apPage[1]->nOverflow ); 6532 } 6533 }else{ 6534 break; 6535 } 6536 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ 6537 break; 6538 }else{ 6539 MemPage * const pParent = pCur->apPage[iPage-1]; 6540 int const iIdx = pCur->aiIdx[iPage-1]; 6541 6542 rc = sqlite3PagerWrite(pParent->pDbPage); 6543 if( rc==SQLITE_OK ){ 6544#ifndef SQLITE_OMIT_QUICKBALANCE 6545 if( pPage->hasData 6546 && pPage->nOverflow==1 6547 && pPage->aOvfl[0].idx==pPage->nCell 6548 && pParent->pgno!=1 6549 && pParent->nCell==iIdx 6550 ){ 6551 /* Call balance_quick() to create a new sibling of pPage on which 6552 ** to store the overflow cell. balance_quick() inserts a new cell 6553 ** into pParent, which may cause pParent overflow. If this 6554 ** happens, the next interation of the do-loop will balance pParent 6555 ** use either balance_nonroot() or balance_deeper(). Until this 6556 ** happens, the overflow cell is stored in the aBalanceQuickSpace[] 6557 ** buffer. 6558 ** 6559 ** The purpose of the following assert() is to check that only a 6560 ** single call to balance_quick() is made for each call to this 6561 ** function. If this were not verified, a subtle bug involving reuse 6562 ** of the aBalanceQuickSpace[] might sneak in. 6563 */ 6564 assert( (balance_quick_called++)==0 ); 6565 rc = balance_quick(pParent, pPage, aBalanceQuickSpace); 6566 }else 6567#endif 6568 { 6569 /* In this case, call balance_nonroot() to redistribute cells 6570 ** between pPage and up to 2 of its sibling pages. This involves 6571 ** modifying the contents of pParent, which may cause pParent to 6572 ** become overfull or underfull. The next iteration of the do-loop 6573 ** will balance the parent page to correct this. 6574 ** 6575 ** If the parent page becomes overfull, the overflow cell or cells 6576 ** are stored in the pSpace buffer allocated immediately below. 6577 ** A subsequent iteration of the do-loop will deal with this by 6578 ** calling balance_nonroot() (balance_deeper() may be called first, 6579 ** but it doesn't deal with overflow cells - just moves them to a 6580 ** different page). Once this subsequent call to balance_nonroot() 6581 ** has completed, it is safe to release the pSpace buffer used by 6582 ** the previous call, as the overflow cell data will have been 6583 ** copied either into the body of a database page or into the new 6584 ** pSpace buffer passed to the latter call to balance_nonroot(). 6585 */ 6586 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); 6587 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1); 6588 if( pFree ){ 6589 /* If pFree is not NULL, it points to the pSpace buffer used 6590 ** by a previous call to balance_nonroot(). Its contents are 6591 ** now stored either on real database pages or within the 6592 ** new pSpace buffer, so it may be safely freed here. */ 6593 sqlite3PageFree(pFree); 6594 } 6595 6596 /* The pSpace buffer will be freed after the next call to 6597 ** balance_nonroot(), or just before this function returns, whichever 6598 ** comes first. */ 6599 pFree = pSpace; 6600 } 6601 } 6602 6603 pPage->nOverflow = 0; 6604 6605 /* The next iteration of the do-loop balances the parent page. */ 6606 releasePage(pPage); 6607 pCur->iPage--; 6608 } 6609 }while( rc==SQLITE_OK ); 6610 6611 if( pFree ){ 6612 sqlite3PageFree(pFree); 6613 } 6614 return rc; 6615} 6616 6617 6618/* 6619** Insert a new record into the BTree. The key is given by (pKey,nKey) 6620** and the data is given by (pData,nData). The cursor is used only to 6621** define what table the record should be inserted into. The cursor 6622** is left pointing at a random location. 6623** 6624** For an INTKEY table, only the nKey value of the key is used. pKey is 6625** ignored. For a ZERODATA table, the pData and nData are both ignored. 6626** 6627** If the seekResult parameter is non-zero, then a successful call to 6628** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already 6629** been performed. seekResult is the search result returned (a negative 6630** number if pCur points at an entry that is smaller than (pKey, nKey), or 6631** a positive value if pCur points at an etry that is larger than 6632** (pKey, nKey)). 6633** 6634** If the seekResult parameter is non-zero, then the caller guarantees that 6635** cursor pCur is pointing at the existing copy of a row that is to be 6636** overwritten. If the seekResult parameter is 0, then cursor pCur may 6637** point to any entry or to no entry at all and so this function has to seek 6638** the cursor before the new key can be inserted. 6639*/ 6640int sqlite3BtreeInsert( 6641 BtCursor *pCur, /* Insert data into the table of this cursor */ 6642 const void *pKey, i64 nKey, /* The key of the new record */ 6643 const void *pData, int nData, /* The data of the new record */ 6644 int nZero, /* Number of extra 0 bytes to append to data */ 6645 int appendBias, /* True if this is likely an append */ 6646 int seekResult /* Result of prior MovetoUnpacked() call */ 6647){ 6648 int rc; 6649 int loc = seekResult; /* -1: before desired location +1: after */ 6650 int szNew = 0; 6651 int idx; 6652 MemPage *pPage; 6653 Btree *p = pCur->pBtree; 6654 BtShared *pBt = p->pBt; 6655 unsigned char *oldCell; 6656 unsigned char *newCell = 0; 6657 6658 if( pCur->eState==CURSOR_FAULT ){ 6659 assert( pCur->skipNext!=SQLITE_OK ); 6660 return pCur->skipNext; 6661 } 6662 6663 assert( cursorHoldsMutex(pCur) ); 6664 assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly ); 6665 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 6666 6667 /* Assert that the caller has been consistent. If this cursor was opened 6668 ** expecting an index b-tree, then the caller should be inserting blob 6669 ** keys with no associated data. If the cursor was opened expecting an 6670 ** intkey table, the caller should be inserting integer keys with a 6671 ** blob of associated data. */ 6672 assert( (pKey==0)==(pCur->pKeyInfo==0) ); 6673 6674 /* If this is an insert into a table b-tree, invalidate any incrblob 6675 ** cursors open on the row being replaced (assuming this is a replace 6676 ** operation - if it is not, the following is a no-op). */ 6677 if( pCur->pKeyInfo==0 ){ 6678 invalidateIncrblobCursors(p, nKey, 0); 6679 } 6680 6681 /* Save the positions of any other cursors open on this table. 6682 ** 6683 ** In some cases, the call to btreeMoveto() below is a no-op. For 6684 ** example, when inserting data into a table with auto-generated integer 6685 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 6686 ** integer key to use. It then calls this function to actually insert the 6687 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes 6688 ** that the cursor is already where it needs to be and returns without 6689 ** doing any work. To avoid thwarting these optimizations, it is important 6690 ** not to clear the cursor here. 6691 */ 6692 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 6693 if( rc ) return rc; 6694 if( !loc ){ 6695 rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc); 6696 if( rc ) return rc; 6697 } 6698 assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) ); 6699 6700 pPage = pCur->apPage[pCur->iPage]; 6701 assert( pPage->intKey || nKey>=0 ); 6702 assert( pPage->leaf || !pPage->intKey ); 6703 6704 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", 6705 pCur->pgnoRoot, nKey, nData, pPage->pgno, 6706 loc==0 ? "overwrite" : "new entry")); 6707 assert( pPage->isInit ); 6708 allocateTempSpace(pBt); 6709 newCell = pBt->pTmpSpace; 6710 if( newCell==0 ) return SQLITE_NOMEM; 6711 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); 6712 if( rc ) goto end_insert; 6713 assert( szNew==cellSizePtr(pPage, newCell) ); 6714 assert( szNew <= MX_CELL_SIZE(pBt) ); 6715 idx = pCur->aiIdx[pCur->iPage]; 6716 if( loc==0 ){ 6717 u16 szOld; 6718 assert( idx<pPage->nCell ); 6719 rc = sqlite3PagerWrite(pPage->pDbPage); 6720 if( rc ){ 6721 goto end_insert; 6722 } 6723 oldCell = findCell(pPage, idx); 6724 if( !pPage->leaf ){ 6725 memcpy(newCell, oldCell, 4); 6726 } 6727 szOld = cellSizePtr(pPage, oldCell); 6728 rc = clearCell(pPage, oldCell); 6729 dropCell(pPage, idx, szOld, &rc); 6730 if( rc ) goto end_insert; 6731 }else if( loc<0 && pPage->nCell>0 ){ 6732 assert( pPage->leaf ); 6733 idx = ++pCur->aiIdx[pCur->iPage]; 6734 }else{ 6735 assert( pPage->leaf ); 6736 } 6737 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc); 6738 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); 6739 6740 /* If no error has occured and pPage has an overflow cell, call balance() 6741 ** to redistribute the cells within the tree. Since balance() may move 6742 ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey 6743 ** variables. 6744 ** 6745 ** Previous versions of SQLite called moveToRoot() to move the cursor 6746 ** back to the root page as balance() used to invalidate the contents 6747 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, 6748 ** set the cursor state to "invalid". This makes common insert operations 6749 ** slightly faster. 6750 ** 6751 ** There is a subtle but important optimization here too. When inserting 6752 ** multiple records into an intkey b-tree using a single cursor (as can 6753 ** happen while processing an "INSERT INTO ... SELECT" statement), it 6754 ** is advantageous to leave the cursor pointing to the last entry in 6755 ** the b-tree if possible. If the cursor is left pointing to the last 6756 ** entry in the table, and the next row inserted has an integer key 6757 ** larger than the largest existing key, it is possible to insert the 6758 ** row without seeking the cursor. This can be a big performance boost. 6759 */ 6760 pCur->info.nSize = 0; 6761 pCur->validNKey = 0; 6762 if( rc==SQLITE_OK && pPage->nOverflow ){ 6763 rc = balance(pCur); 6764 6765 /* Must make sure nOverflow is reset to zero even if the balance() 6766 ** fails. Internal data structure corruption will result otherwise. 6767 ** Also, set the cursor state to invalid. This stops saveCursorPosition() 6768 ** from trying to save the current position of the cursor. */ 6769 pCur->apPage[pCur->iPage]->nOverflow = 0; 6770 pCur->eState = CURSOR_INVALID; 6771 } 6772 assert( pCur->apPage[pCur->iPage]->nOverflow==0 ); 6773 6774end_insert: 6775 return rc; 6776} 6777 6778/* 6779** Delete the entry that the cursor is pointing to. The cursor 6780** is left pointing at a arbitrary location. 6781*/ 6782int sqlite3BtreeDelete(BtCursor *pCur){ 6783 Btree *p = pCur->pBtree; 6784 BtShared *pBt = p->pBt; 6785 int rc; /* Return code */ 6786 MemPage *pPage; /* Page to delete cell from */ 6787 unsigned char *pCell; /* Pointer to cell to delete */ 6788 int iCellIdx; /* Index of cell to delete */ 6789 int iCellDepth; /* Depth of node containing pCell */ 6790 6791 assert( cursorHoldsMutex(pCur) ); 6792 assert( pBt->inTransaction==TRANS_WRITE ); 6793 assert( !pBt->readOnly ); 6794 assert( pCur->wrFlag ); 6795 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 6796 assert( !hasReadConflicts(p, pCur->pgnoRoot) ); 6797 6798 if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell) 6799 || NEVER(pCur->eState!=CURSOR_VALID) 6800 ){ 6801 return SQLITE_ERROR; /* Something has gone awry. */ 6802 } 6803 6804 /* If this is a delete operation to remove a row from a table b-tree, 6805 ** invalidate any incrblob cursors open on the row being deleted. */ 6806 if( pCur->pKeyInfo==0 ){ 6807 invalidateIncrblobCursors(p, pCur->info.nKey, 0); 6808 } 6809 6810 iCellDepth = pCur->iPage; 6811 iCellIdx = pCur->aiIdx[iCellDepth]; 6812 pPage = pCur->apPage[iCellDepth]; 6813 pCell = findCell(pPage, iCellIdx); 6814 6815 /* If the page containing the entry to delete is not a leaf page, move 6816 ** the cursor to the largest entry in the tree that is smaller than 6817 ** the entry being deleted. This cell will replace the cell being deleted 6818 ** from the internal node. The 'previous' entry is used for this instead 6819 ** of the 'next' entry, as the previous entry is always a part of the 6820 ** sub-tree headed by the child page of the cell being deleted. This makes 6821 ** balancing the tree following the delete operation easier. */ 6822 if( !pPage->leaf ){ 6823 int notUsed; 6824 rc = sqlite3BtreePrevious(pCur, ¬Used); 6825 if( rc ) return rc; 6826 } 6827 6828 /* Save the positions of any other cursors open on this table before 6829 ** making any modifications. Make the page containing the entry to be 6830 ** deleted writable. Then free any overflow pages associated with the 6831 ** entry and finally remove the cell itself from within the page. 6832 */ 6833 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 6834 if( rc ) return rc; 6835 rc = sqlite3PagerWrite(pPage->pDbPage); 6836 if( rc ) return rc; 6837 rc = clearCell(pPage, pCell); 6838 dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc); 6839 if( rc ) return rc; 6840 6841 /* If the cell deleted was not located on a leaf page, then the cursor 6842 ** is currently pointing to the largest entry in the sub-tree headed 6843 ** by the child-page of the cell that was just deleted from an internal 6844 ** node. The cell from the leaf node needs to be moved to the internal 6845 ** node to replace the deleted cell. */ 6846 if( !pPage->leaf ){ 6847 MemPage *pLeaf = pCur->apPage[pCur->iPage]; 6848 int nCell; 6849 Pgno n = pCur->apPage[iCellDepth+1]->pgno; 6850 unsigned char *pTmp; 6851 6852 pCell = findCell(pLeaf, pLeaf->nCell-1); 6853 nCell = cellSizePtr(pLeaf, pCell); 6854 assert( MX_CELL_SIZE(pBt) >= nCell ); 6855 6856 allocateTempSpace(pBt); 6857 pTmp = pBt->pTmpSpace; 6858 6859 rc = sqlite3PagerWrite(pLeaf->pDbPage); 6860 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); 6861 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); 6862 if( rc ) return rc; 6863 } 6864 6865 /* Balance the tree. If the entry deleted was located on a leaf page, 6866 ** then the cursor still points to that page. In this case the first 6867 ** call to balance() repairs the tree, and the if(...) condition is 6868 ** never true. 6869 ** 6870 ** Otherwise, if the entry deleted was on an internal node page, then 6871 ** pCur is pointing to the leaf page from which a cell was removed to 6872 ** replace the cell deleted from the internal node. This is slightly 6873 ** tricky as the leaf node may be underfull, and the internal node may 6874 ** be either under or overfull. In this case run the balancing algorithm 6875 ** on the leaf node first. If the balance proceeds far enough up the 6876 ** tree that we can be sure that any problem in the internal node has 6877 ** been corrected, so be it. Otherwise, after balancing the leaf node, 6878 ** walk the cursor up the tree to the internal node and balance it as 6879 ** well. */ 6880 rc = balance(pCur); 6881 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ 6882 while( pCur->iPage>iCellDepth ){ 6883 releasePage(pCur->apPage[pCur->iPage--]); 6884 } 6885 rc = balance(pCur); 6886 } 6887 6888 if( rc==SQLITE_OK ){ 6889 moveToRoot(pCur); 6890 } 6891 return rc; 6892} 6893 6894/* 6895** Create a new BTree table. Write into *piTable the page 6896** number for the root page of the new table. 6897** 6898** The type of type is determined by the flags parameter. Only the 6899** following values of flags are currently in use. Other values for 6900** flags might not work: 6901** 6902** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 6903** BTREE_ZERODATA Used for SQL indices 6904*/ 6905static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ 6906 BtShared *pBt = p->pBt; 6907 MemPage *pRoot; 6908 Pgno pgnoRoot; 6909 int rc; 6910 int ptfFlags; /* Page-type flage for the root page of new table */ 6911 6912 assert( sqlite3BtreeHoldsMutex(p) ); 6913 assert( pBt->inTransaction==TRANS_WRITE ); 6914 assert( !pBt->readOnly ); 6915 6916#ifdef SQLITE_OMIT_AUTOVACUUM 6917 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 6918 if( rc ){ 6919 return rc; 6920 } 6921#else 6922 if( pBt->autoVacuum ){ 6923 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 6924 MemPage *pPageMove; /* The page to move to. */ 6925 6926 /* Creating a new table may probably require moving an existing database 6927 ** to make room for the new tables root page. In case this page turns 6928 ** out to be an overflow page, delete all overflow page-map caches 6929 ** held by open cursors. 6930 */ 6931 invalidateAllOverflowCache(pBt); 6932 6933 /* Read the value of meta[3] from the database to determine where the 6934 ** root page of the new table should go. meta[3] is the largest root-page 6935 ** created so far, so the new root-page is (meta[3]+1). 6936 */ 6937 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); 6938 pgnoRoot++; 6939 6940 /* The new root-page may not be allocated on a pointer-map page, or the 6941 ** PENDING_BYTE page. 6942 */ 6943 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 6944 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 6945 pgnoRoot++; 6946 } 6947 assert( pgnoRoot>=3 ); 6948 6949 /* Allocate a page. The page that currently resides at pgnoRoot will 6950 ** be moved to the allocated page (unless the allocated page happens 6951 ** to reside at pgnoRoot). 6952 */ 6953 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); 6954 if( rc!=SQLITE_OK ){ 6955 return rc; 6956 } 6957 6958 if( pgnoMove!=pgnoRoot ){ 6959 /* pgnoRoot is the page that will be used for the root-page of 6960 ** the new table (assuming an error did not occur). But we were 6961 ** allocated pgnoMove. If required (i.e. if it was not allocated 6962 ** by extending the file), the current page at position pgnoMove 6963 ** is already journaled. 6964 */ 6965 u8 eType = 0; 6966 Pgno iPtrPage = 0; 6967 6968 releasePage(pPageMove); 6969 6970 /* Move the page currently at pgnoRoot to pgnoMove. */ 6971 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6972 if( rc!=SQLITE_OK ){ 6973 return rc; 6974 } 6975 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 6976 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 6977 rc = SQLITE_CORRUPT_BKPT; 6978 } 6979 if( rc!=SQLITE_OK ){ 6980 releasePage(pRoot); 6981 return rc; 6982 } 6983 assert( eType!=PTRMAP_ROOTPAGE ); 6984 assert( eType!=PTRMAP_FREEPAGE ); 6985 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 6986 releasePage(pRoot); 6987 6988 /* Obtain the page at pgnoRoot */ 6989 if( rc!=SQLITE_OK ){ 6990 return rc; 6991 } 6992 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 6993 if( rc!=SQLITE_OK ){ 6994 return rc; 6995 } 6996 rc = sqlite3PagerWrite(pRoot->pDbPage); 6997 if( rc!=SQLITE_OK ){ 6998 releasePage(pRoot); 6999 return rc; 7000 } 7001 }else{ 7002 pRoot = pPageMove; 7003 } 7004 7005 /* Update the pointer-map and meta-data with the new root-page number. */ 7006 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); 7007 if( rc ){ 7008 releasePage(pRoot); 7009 return rc; 7010 } 7011 7012 /* When the new root page was allocated, page 1 was made writable in 7013 ** order either to increase the database filesize, or to decrement the 7014 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail. 7015 */ 7016 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) ); 7017 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 7018 if( NEVER(rc) ){ 7019 releasePage(pRoot); 7020 return rc; 7021 } 7022 7023 }else{ 7024 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 7025 if( rc ) return rc; 7026 } 7027#endif 7028 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 7029 if( createTabFlags & BTREE_INTKEY ){ 7030 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF; 7031 }else{ 7032 ptfFlags = PTF_ZERODATA | PTF_LEAF; 7033 } 7034 zeroPage(pRoot, ptfFlags); 7035 sqlite3PagerUnref(pRoot->pDbPage); 7036 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); 7037 *piTable = (int)pgnoRoot; 7038 return SQLITE_OK; 7039} 7040int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ 7041 int rc; 7042 sqlite3BtreeEnter(p); 7043 rc = btreeCreateTable(p, piTable, flags); 7044 sqlite3BtreeLeave(p); 7045 return rc; 7046} 7047 7048/* 7049** Erase the given database page and all its children. Return 7050** the page to the freelist. 7051*/ 7052static int clearDatabasePage( 7053 BtShared *pBt, /* The BTree that contains the table */ 7054 Pgno pgno, /* Page number to clear */ 7055 int freePageFlag, /* Deallocate page if true */ 7056 int *pnChange /* Add number of Cells freed to this counter */ 7057){ 7058 MemPage *pPage; 7059 int rc; 7060 unsigned char *pCell; 7061 int i; 7062 7063 assert( sqlite3_mutex_held(pBt->mutex) ); 7064 if( pgno>btreePagecount(pBt) ){ 7065 return SQLITE_CORRUPT_BKPT; 7066 } 7067 7068 rc = getAndInitPage(pBt, pgno, &pPage); 7069 if( rc ) return rc; 7070 for(i=0; i<pPage->nCell; i++){ 7071 pCell = findCell(pPage, i); 7072 if( !pPage->leaf ){ 7073 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 7074 if( rc ) goto cleardatabasepage_out; 7075 } 7076 rc = clearCell(pPage, pCell); 7077 if( rc ) goto cleardatabasepage_out; 7078 } 7079 if( !pPage->leaf ){ 7080 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange); 7081 if( rc ) goto cleardatabasepage_out; 7082 }else if( pnChange ){ 7083 assert( pPage->intKey ); 7084 *pnChange += pPage->nCell; 7085 } 7086 if( freePageFlag ){ 7087 freePage(pPage, &rc); 7088 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 7089 zeroPage(pPage, pPage->aData[0] | PTF_LEAF); 7090 } 7091 7092cleardatabasepage_out: 7093 releasePage(pPage); 7094 return rc; 7095} 7096 7097/* 7098** Delete all information from a single table in the database. iTable is 7099** the page number of the root of the table. After this routine returns, 7100** the root page is empty, but still exists. 7101** 7102** This routine will fail with SQLITE_LOCKED if there are any open 7103** read cursors on the table. Open write cursors are moved to the 7104** root of the table. 7105** 7106** If pnChange is not NULL, then table iTable must be an intkey table. The 7107** integer value pointed to by pnChange is incremented by the number of 7108** entries in the table. 7109*/ 7110int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ 7111 int rc; 7112 BtShared *pBt = p->pBt; 7113 sqlite3BtreeEnter(p); 7114 assert( p->inTrans==TRANS_WRITE ); 7115 7116 /* Invalidate all incrblob cursors open on table iTable (assuming iTable 7117 ** is the root of a table b-tree - if it is not, the following call is 7118 ** a no-op). */ 7119 invalidateIncrblobCursors(p, 0, 1); 7120 7121 rc = saveAllCursors(pBt, (Pgno)iTable, 0); 7122 if( SQLITE_OK==rc ){ 7123 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 7124 } 7125 sqlite3BtreeLeave(p); 7126 return rc; 7127} 7128 7129/* 7130** Erase all information in a table and add the root of the table to 7131** the freelist. Except, the root of the principle table (the one on 7132** page 1) is never added to the freelist. 7133** 7134** This routine will fail with SQLITE_LOCKED if there are any open 7135** cursors on the table. 7136** 7137** If AUTOVACUUM is enabled and the page at iTable is not the last 7138** root page in the database file, then the last root page 7139** in the database file is moved into the slot formerly occupied by 7140** iTable and that last slot formerly occupied by the last root page 7141** is added to the freelist instead of iTable. In this say, all 7142** root pages are kept at the beginning of the database file, which 7143** is necessary for AUTOVACUUM to work right. *piMoved is set to the 7144** page number that used to be the last root page in the file before 7145** the move. If no page gets moved, *piMoved is set to 0. 7146** The last root page is recorded in meta[3] and the value of 7147** meta[3] is updated by this procedure. 7148*/ 7149static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 7150 int rc; 7151 MemPage *pPage = 0; 7152 BtShared *pBt = p->pBt; 7153 7154 assert( sqlite3BtreeHoldsMutex(p) ); 7155 assert( p->inTrans==TRANS_WRITE ); 7156 7157 /* It is illegal to drop a table if any cursors are open on the 7158 ** database. This is because in auto-vacuum mode the backend may 7159 ** need to move another root-page to fill a gap left by the deleted 7160 ** root page. If an open cursor was using this page a problem would 7161 ** occur. 7162 ** 7163 ** This error is caught long before control reaches this point. 7164 */ 7165 if( NEVER(pBt->pCursor) ){ 7166 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db); 7167 return SQLITE_LOCKED_SHAREDCACHE; 7168 } 7169 7170 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 7171 if( rc ) return rc; 7172 rc = sqlite3BtreeClearTable(p, iTable, 0); 7173 if( rc ){ 7174 releasePage(pPage); 7175 return rc; 7176 } 7177 7178 *piMoved = 0; 7179 7180 if( iTable>1 ){ 7181#ifdef SQLITE_OMIT_AUTOVACUUM 7182 freePage(pPage, &rc); 7183 releasePage(pPage); 7184#else 7185 if( pBt->autoVacuum ){ 7186 Pgno maxRootPgno; 7187 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); 7188 7189 if( iTable==maxRootPgno ){ 7190 /* If the table being dropped is the table with the largest root-page 7191 ** number in the database, put the root page on the free list. 7192 */ 7193 freePage(pPage, &rc); 7194 releasePage(pPage); 7195 if( rc!=SQLITE_OK ){ 7196 return rc; 7197 } 7198 }else{ 7199 /* The table being dropped does not have the largest root-page 7200 ** number in the database. So move the page that does into the 7201 ** gap left by the deleted root-page. 7202 */ 7203 MemPage *pMove; 7204 releasePage(pPage); 7205 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 7206 if( rc!=SQLITE_OK ){ 7207 return rc; 7208 } 7209 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 7210 releasePage(pMove); 7211 if( rc!=SQLITE_OK ){ 7212 return rc; 7213 } 7214 pMove = 0; 7215 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 7216 freePage(pMove, &rc); 7217 releasePage(pMove); 7218 if( rc!=SQLITE_OK ){ 7219 return rc; 7220 } 7221 *piMoved = maxRootPgno; 7222 } 7223 7224 /* Set the new 'max-root-page' value in the database header. This 7225 ** is the old value less one, less one more if that happens to 7226 ** be a root-page number, less one again if that is the 7227 ** PENDING_BYTE_PAGE. 7228 */ 7229 maxRootPgno--; 7230 while( maxRootPgno==PENDING_BYTE_PAGE(pBt) 7231 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ 7232 maxRootPgno--; 7233 } 7234 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 7235 7236 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 7237 }else{ 7238 freePage(pPage, &rc); 7239 releasePage(pPage); 7240 } 7241#endif 7242 }else{ 7243 /* If sqlite3BtreeDropTable was called on page 1. 7244 ** This really never should happen except in a corrupt 7245 ** database. 7246 */ 7247 zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); 7248 releasePage(pPage); 7249 } 7250 return rc; 7251} 7252int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 7253 int rc; 7254 sqlite3BtreeEnter(p); 7255 rc = btreeDropTable(p, iTable, piMoved); 7256 sqlite3BtreeLeave(p); 7257 return rc; 7258} 7259 7260 7261/* 7262** This function may only be called if the b-tree connection already 7263** has a read or write transaction open on the database. 7264** 7265** Read the meta-information out of a database file. Meta[0] 7266** is the number of free pages currently in the database. Meta[1] 7267** through meta[15] are available for use by higher layers. Meta[0] 7268** is read-only, the others are read/write. 7269** 7270** The schema layer numbers meta values differently. At the schema 7271** layer (and the SetCookie and ReadCookie opcodes) the number of 7272** free pages is not visible. So Cookie[0] is the same as Meta[1]. 7273*/ 7274void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 7275 BtShared *pBt = p->pBt; 7276 7277 sqlite3BtreeEnter(p); 7278 assert( p->inTrans>TRANS_NONE ); 7279 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) ); 7280 assert( pBt->pPage1 ); 7281 assert( idx>=0 && idx<=15 ); 7282 7283 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); 7284 7285 /* If auto-vacuum is disabled in this build and this is an auto-vacuum 7286 ** database, mark the database as read-only. */ 7287#ifdef SQLITE_OMIT_AUTOVACUUM 7288 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1; 7289#endif 7290 7291 sqlite3BtreeLeave(p); 7292} 7293 7294/* 7295** Write meta-information back into the database. Meta[0] is 7296** read-only and may not be written. 7297*/ 7298int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 7299 BtShared *pBt = p->pBt; 7300 unsigned char *pP1; 7301 int rc; 7302 assert( idx>=1 && idx<=15 ); 7303 sqlite3BtreeEnter(p); 7304 assert( p->inTrans==TRANS_WRITE ); 7305 assert( pBt->pPage1!=0 ); 7306 pP1 = pBt->pPage1->aData; 7307 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 7308 if( rc==SQLITE_OK ){ 7309 put4byte(&pP1[36 + idx*4], iMeta); 7310#ifndef SQLITE_OMIT_AUTOVACUUM 7311 if( idx==BTREE_INCR_VACUUM ){ 7312 assert( pBt->autoVacuum || iMeta==0 ); 7313 assert( iMeta==0 || iMeta==1 ); 7314 pBt->incrVacuum = (u8)iMeta; 7315 } 7316#endif 7317 } 7318 sqlite3BtreeLeave(p); 7319 return rc; 7320} 7321 7322#ifndef SQLITE_OMIT_BTREECOUNT 7323/* 7324** The first argument, pCur, is a cursor opened on some b-tree. Count the 7325** number of entries in the b-tree and write the result to *pnEntry. 7326** 7327** SQLITE_OK is returned if the operation is successfully executed. 7328** Otherwise, if an error is encountered (i.e. an IO error or database 7329** corruption) an SQLite error code is returned. 7330*/ 7331int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){ 7332 i64 nEntry = 0; /* Value to return in *pnEntry */ 7333 int rc; /* Return code */ 7334 rc = moveToRoot(pCur); 7335 7336 /* Unless an error occurs, the following loop runs one iteration for each 7337 ** page in the B-Tree structure (not including overflow pages). 7338 */ 7339 while( rc==SQLITE_OK ){ 7340 int iIdx; /* Index of child node in parent */ 7341 MemPage *pPage; /* Current page of the b-tree */ 7342 7343 /* If this is a leaf page or the tree is not an int-key tree, then 7344 ** this page contains countable entries. Increment the entry counter 7345 ** accordingly. 7346 */ 7347 pPage = pCur->apPage[pCur->iPage]; 7348 if( pPage->leaf || !pPage->intKey ){ 7349 nEntry += pPage->nCell; 7350 } 7351 7352 /* pPage is a leaf node. This loop navigates the cursor so that it 7353 ** points to the first interior cell that it points to the parent of 7354 ** the next page in the tree that has not yet been visited. The 7355 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 7356 ** of the page, or to the number of cells in the page if the next page 7357 ** to visit is the right-child of its parent. 7358 ** 7359 ** If all pages in the tree have been visited, return SQLITE_OK to the 7360 ** caller. 7361 */ 7362 if( pPage->leaf ){ 7363 do { 7364 if( pCur->iPage==0 ){ 7365 /* All pages of the b-tree have been visited. Return successfully. */ 7366 *pnEntry = nEntry; 7367 return SQLITE_OK; 7368 } 7369 moveToParent(pCur); 7370 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell ); 7371 7372 pCur->aiIdx[pCur->iPage]++; 7373 pPage = pCur->apPage[pCur->iPage]; 7374 } 7375 7376 /* Descend to the child node of the cell that the cursor currently 7377 ** points at. This is the right-child if (iIdx==pPage->nCell). 7378 */ 7379 iIdx = pCur->aiIdx[pCur->iPage]; 7380 if( iIdx==pPage->nCell ){ 7381 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 7382 }else{ 7383 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 7384 } 7385 } 7386 7387 /* An error has occurred. Return an error code. */ 7388 return rc; 7389} 7390#endif 7391 7392/* 7393** Return the pager associated with a BTree. This routine is used for 7394** testing and debugging only. 7395*/ 7396Pager *sqlite3BtreePager(Btree *p){ 7397 return p->pBt->pPager; 7398} 7399 7400#ifndef SQLITE_OMIT_INTEGRITY_CHECK 7401/* 7402** Append a message to the error message string. 7403*/ 7404static void checkAppendMsg( 7405 IntegrityCk *pCheck, 7406 char *zMsg1, 7407 const char *zFormat, 7408 ... 7409){ 7410 va_list ap; 7411 if( !pCheck->mxErr ) return; 7412 pCheck->mxErr--; 7413 pCheck->nErr++; 7414 va_start(ap, zFormat); 7415 if( pCheck->errMsg.nChar ){ 7416 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1); 7417 } 7418 if( zMsg1 ){ 7419 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1); 7420 } 7421 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap); 7422 va_end(ap); 7423 if( pCheck->errMsg.mallocFailed ){ 7424 pCheck->mallocFailed = 1; 7425 } 7426} 7427#endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7428 7429#ifndef SQLITE_OMIT_INTEGRITY_CHECK 7430/* 7431** Add 1 to the reference count for page iPage. If this is the second 7432** reference to the page, add an error message to pCheck->zErrMsg. 7433** Return 1 if there are 2 ore more references to the page and 0 if 7434** if this is the first reference to the page. 7435** 7436** Also check that the page number is in bounds. 7437*/ 7438static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){ 7439 if( iPage==0 ) return 1; 7440 if( iPage>pCheck->nPage ){ 7441 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); 7442 return 1; 7443 } 7444 if( pCheck->anRef[iPage]==1 ){ 7445 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage); 7446 return 1; 7447 } 7448 return (pCheck->anRef[iPage]++)>1; 7449} 7450 7451#ifndef SQLITE_OMIT_AUTOVACUUM 7452/* 7453** Check that the entry in the pointer-map for page iChild maps to 7454** page iParent, pointer type ptrType. If not, append an error message 7455** to pCheck. 7456*/ 7457static void checkPtrmap( 7458 IntegrityCk *pCheck, /* Integrity check context */ 7459 Pgno iChild, /* Child page number */ 7460 u8 eType, /* Expected pointer map type */ 7461 Pgno iParent, /* Expected pointer map parent page number */ 7462 char *zContext /* Context description (used for error msg) */ 7463){ 7464 int rc; 7465 u8 ePtrmapType; 7466 Pgno iPtrmapParent; 7467 7468 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 7469 if( rc!=SQLITE_OK ){ 7470 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1; 7471 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); 7472 return; 7473 } 7474 7475 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 7476 checkAppendMsg(pCheck, zContext, 7477 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 7478 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 7479 } 7480} 7481#endif 7482 7483/* 7484** Check the integrity of the freelist or of an overflow page list. 7485** Verify that the number of pages on the list is N. 7486*/ 7487static void checkList( 7488 IntegrityCk *pCheck, /* Integrity checking context */ 7489 int isFreeList, /* True for a freelist. False for overflow page list */ 7490 int iPage, /* Page number for first page in the list */ 7491 int N, /* Expected number of pages in the list */ 7492 char *zContext /* Context for error messages */ 7493){ 7494 int i; 7495 int expected = N; 7496 int iFirst = iPage; 7497 while( N-- > 0 && pCheck->mxErr ){ 7498 DbPage *pOvflPage; 7499 unsigned char *pOvflData; 7500 if( iPage<1 ){ 7501 checkAppendMsg(pCheck, zContext, 7502 "%d of %d pages missing from overflow list starting at %d", 7503 N+1, expected, iFirst); 7504 break; 7505 } 7506 if( checkRef(pCheck, iPage, zContext) ) break; 7507 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ 7508 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); 7509 break; 7510 } 7511 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 7512 if( isFreeList ){ 7513 int n = get4byte(&pOvflData[4]); 7514#ifndef SQLITE_OMIT_AUTOVACUUM 7515 if( pCheck->pBt->autoVacuum ){ 7516 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); 7517 } 7518#endif 7519 if( n>(int)pCheck->pBt->usableSize/4-2 ){ 7520 checkAppendMsg(pCheck, zContext, 7521 "freelist leaf count too big on page %d", iPage); 7522 N--; 7523 }else{ 7524 for(i=0; i<n; i++){ 7525 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 7526#ifndef SQLITE_OMIT_AUTOVACUUM 7527 if( pCheck->pBt->autoVacuum ){ 7528 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); 7529 } 7530#endif 7531 checkRef(pCheck, iFreePage, zContext); 7532 } 7533 N -= n; 7534 } 7535 } 7536#ifndef SQLITE_OMIT_AUTOVACUUM 7537 else{ 7538 /* If this database supports auto-vacuum and iPage is not the last 7539 ** page in this overflow list, check that the pointer-map entry for 7540 ** the following page matches iPage. 7541 */ 7542 if( pCheck->pBt->autoVacuum && N>0 ){ 7543 i = get4byte(pOvflData); 7544 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); 7545 } 7546 } 7547#endif 7548 iPage = get4byte(pOvflData); 7549 sqlite3PagerUnref(pOvflPage); 7550 } 7551} 7552#endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7553 7554#ifndef SQLITE_OMIT_INTEGRITY_CHECK 7555/* 7556** Do various sanity checks on a single page of a tree. Return 7557** the tree depth. Root pages return 0. Parents of root pages 7558** return 1, and so forth. 7559** 7560** These checks are done: 7561** 7562** 1. Make sure that cells and freeblocks do not overlap 7563** but combine to completely cover the page. 7564** NO 2. Make sure cell keys are in order. 7565** NO 3. Make sure no key is less than or equal to zLowerBound. 7566** NO 4. Make sure no key is greater than or equal to zUpperBound. 7567** 5. Check the integrity of overflow pages. 7568** 6. Recursively call checkTreePage on all children. 7569** 7. Verify that the depth of all children is the same. 7570** 8. Make sure this page is at least 33% full or else it is 7571** the root of the tree. 7572*/ 7573static int checkTreePage( 7574 IntegrityCk *pCheck, /* Context for the sanity check */ 7575 int iPage, /* Page number of the page to check */ 7576 char *zParentContext, /* Parent context */ 7577 i64 *pnParentMinKey, 7578 i64 *pnParentMaxKey 7579){ 7580 MemPage *pPage; 7581 int i, rc, depth, d2, pgno, cnt; 7582 int hdr, cellStart; 7583 int nCell; 7584 u8 *data; 7585 BtShared *pBt; 7586 int usableSize; 7587 char zContext[100]; 7588 char *hit = 0; 7589 i64 nMinKey = 0; 7590 i64 nMaxKey = 0; 7591 7592 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); 7593 7594 /* Check that the page exists 7595 */ 7596 pBt = pCheck->pBt; 7597 usableSize = pBt->usableSize; 7598 if( iPage==0 ) return 0; 7599 if( checkRef(pCheck, iPage, zParentContext) ) return 0; 7600 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ 7601 checkAppendMsg(pCheck, zContext, 7602 "unable to get the page. error code=%d", rc); 7603 return 0; 7604 } 7605 7606 /* Clear MemPage.isInit to make sure the corruption detection code in 7607 ** btreeInitPage() is executed. */ 7608 pPage->isInit = 0; 7609 if( (rc = btreeInitPage(pPage))!=0 ){ 7610 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ 7611 checkAppendMsg(pCheck, zContext, 7612 "btreeInitPage() returns error code %d", rc); 7613 releasePage(pPage); 7614 return 0; 7615 } 7616 7617 /* Check out all the cells. 7618 */ 7619 depth = 0; 7620 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ 7621 u8 *pCell; 7622 u32 sz; 7623 CellInfo info; 7624 7625 /* Check payload overflow pages 7626 */ 7627 sqlite3_snprintf(sizeof(zContext), zContext, 7628 "On tree page %d cell %d: ", iPage, i); 7629 pCell = findCell(pPage,i); 7630 btreeParseCellPtr(pPage, pCell, &info); 7631 sz = info.nData; 7632 if( !pPage->intKey ) sz += (int)info.nKey; 7633 /* For intKey pages, check that the keys are in order. 7634 */ 7635 else if( i==0 ) nMinKey = nMaxKey = info.nKey; 7636 else{ 7637 if( info.nKey <= nMaxKey ){ 7638 checkAppendMsg(pCheck, zContext, 7639 "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey); 7640 } 7641 nMaxKey = info.nKey; 7642 } 7643 assert( sz==info.nPayload ); 7644 if( (sz>info.nLocal) 7645 && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize]) 7646 ){ 7647 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); 7648 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); 7649#ifndef SQLITE_OMIT_AUTOVACUUM 7650 if( pBt->autoVacuum ){ 7651 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext); 7652 } 7653#endif 7654 checkList(pCheck, 0, pgnoOvfl, nPage, zContext); 7655 } 7656 7657 /* Check sanity of left child page. 7658 */ 7659 if( !pPage->leaf ){ 7660 pgno = get4byte(pCell); 7661#ifndef SQLITE_OMIT_AUTOVACUUM 7662 if( pBt->autoVacuum ){ 7663 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 7664 } 7665#endif 7666 d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey); 7667 if( i>0 && d2!=depth ){ 7668 checkAppendMsg(pCheck, zContext, "Child page depth differs"); 7669 } 7670 depth = d2; 7671 } 7672 } 7673 7674 if( !pPage->leaf ){ 7675 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 7676 sqlite3_snprintf(sizeof(zContext), zContext, 7677 "On page %d at right child: ", iPage); 7678#ifndef SQLITE_OMIT_AUTOVACUUM 7679 if( pBt->autoVacuum ){ 7680 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); 7681 } 7682#endif 7683 checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey); 7684 } 7685 7686 /* For intKey leaf pages, check that the min/max keys are in order 7687 ** with any left/parent/right pages. 7688 */ 7689 if( pPage->leaf && pPage->intKey ){ 7690 /* if we are a left child page */ 7691 if( pnParentMinKey ){ 7692 /* if we are the left most child page */ 7693 if( !pnParentMaxKey ){ 7694 if( nMaxKey > *pnParentMinKey ){ 7695 checkAppendMsg(pCheck, zContext, 7696 "Rowid %lld out of order (max larger than parent min of %lld)", 7697 nMaxKey, *pnParentMinKey); 7698 } 7699 }else{ 7700 if( nMinKey <= *pnParentMinKey ){ 7701 checkAppendMsg(pCheck, zContext, 7702 "Rowid %lld out of order (min less than parent min of %lld)", 7703 nMinKey, *pnParentMinKey); 7704 } 7705 if( nMaxKey > *pnParentMaxKey ){ 7706 checkAppendMsg(pCheck, zContext, 7707 "Rowid %lld out of order (max larger than parent max of %lld)", 7708 nMaxKey, *pnParentMaxKey); 7709 } 7710 *pnParentMinKey = nMaxKey; 7711 } 7712 /* else if we're a right child page */ 7713 } else if( pnParentMaxKey ){ 7714 if( nMinKey <= *pnParentMaxKey ){ 7715 checkAppendMsg(pCheck, zContext, 7716 "Rowid %lld out of order (min less than parent max of %lld)", 7717 nMinKey, *pnParentMaxKey); 7718 } 7719 } 7720 } 7721 7722 /* Check for complete coverage of the page 7723 */ 7724 data = pPage->aData; 7725 hdr = pPage->hdrOffset; 7726 hit = sqlite3PageMalloc( pBt->pageSize ); 7727 if( hit==0 ){ 7728 pCheck->mallocFailed = 1; 7729 }else{ 7730 int contentOffset = get2byteNotZero(&data[hdr+5]); 7731 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ 7732 memset(hit+contentOffset, 0, usableSize-contentOffset); 7733 memset(hit, 1, contentOffset); 7734 nCell = get2byte(&data[hdr+3]); 7735 cellStart = hdr + 12 - 4*pPage->leaf; 7736 for(i=0; i<nCell; i++){ 7737 int pc = get2byte(&data[cellStart+i*2]); 7738 u32 size = 65536; 7739 int j; 7740 if( pc<=usableSize-4 ){ 7741 size = cellSizePtr(pPage, &data[pc]); 7742 } 7743 if( (int)(pc+size-1)>=usableSize ){ 7744 checkAppendMsg(pCheck, 0, 7745 "Corruption detected in cell %d on page %d",i,iPage); 7746 }else{ 7747 for(j=pc+size-1; j>=pc; j--) hit[j]++; 7748 } 7749 } 7750 i = get2byte(&data[hdr+1]); 7751 while( i>0 ){ 7752 int size, j; 7753 assert( i<=usableSize-4 ); /* Enforced by btreeInitPage() */ 7754 size = get2byte(&data[i+2]); 7755 assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */ 7756 for(j=i+size-1; j>=i; j--) hit[j]++; 7757 j = get2byte(&data[i]); 7758 assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */ 7759 assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */ 7760 i = j; 7761 } 7762 for(i=cnt=0; i<usableSize; i++){ 7763 if( hit[i]==0 ){ 7764 cnt++; 7765 }else if( hit[i]>1 ){ 7766 checkAppendMsg(pCheck, 0, 7767 "Multiple uses for byte %d of page %d", i, iPage); 7768 break; 7769 } 7770 } 7771 if( cnt!=data[hdr+7] ){ 7772 checkAppendMsg(pCheck, 0, 7773 "Fragmentation of %d bytes reported as %d on page %d", 7774 cnt, data[hdr+7], iPage); 7775 } 7776 } 7777 sqlite3PageFree(hit); 7778 releasePage(pPage); 7779 return depth+1; 7780} 7781#endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7782 7783#ifndef SQLITE_OMIT_INTEGRITY_CHECK 7784/* 7785** This routine does a complete check of the given BTree file. aRoot[] is 7786** an array of pages numbers were each page number is the root page of 7787** a table. nRoot is the number of entries in aRoot. 7788** 7789** A read-only or read-write transaction must be opened before calling 7790** this function. 7791** 7792** Write the number of error seen in *pnErr. Except for some memory 7793** allocation errors, an error message held in memory obtained from 7794** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is 7795** returned. If a memory allocation error occurs, NULL is returned. 7796*/ 7797char *sqlite3BtreeIntegrityCheck( 7798 Btree *p, /* The btree to be checked */ 7799 int *aRoot, /* An array of root pages numbers for individual trees */ 7800 int nRoot, /* Number of entries in aRoot[] */ 7801 int mxErr, /* Stop reporting errors after this many */ 7802 int *pnErr /* Write number of errors seen to this variable */ 7803){ 7804 Pgno i; 7805 int nRef; 7806 IntegrityCk sCheck; 7807 BtShared *pBt = p->pBt; 7808 char zErr[100]; 7809 7810 sqlite3BtreeEnter(p); 7811 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); 7812 nRef = sqlite3PagerRefcount(pBt->pPager); 7813 sCheck.pBt = pBt; 7814 sCheck.pPager = pBt->pPager; 7815 sCheck.nPage = btreePagecount(sCheck.pBt); 7816 sCheck.mxErr = mxErr; 7817 sCheck.nErr = 0; 7818 sCheck.mallocFailed = 0; 7819 *pnErr = 0; 7820 if( sCheck.nPage==0 ){ 7821 sqlite3BtreeLeave(p); 7822 return 0; 7823 } 7824 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); 7825 if( !sCheck.anRef ){ 7826 *pnErr = 1; 7827 sqlite3BtreeLeave(p); 7828 return 0; 7829 } 7830 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } 7831 i = PENDING_BYTE_PAGE(pBt); 7832 if( i<=sCheck.nPage ){ 7833 sCheck.anRef[i] = 1; 7834 } 7835 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000); 7836 sCheck.errMsg.useMalloc = 2; 7837 7838 /* Check the integrity of the freelist 7839 */ 7840 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 7841 get4byte(&pBt->pPage1->aData[36]), "Main freelist: "); 7842 7843 /* Check all the tables. 7844 */ 7845 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 7846 if( aRoot[i]==0 ) continue; 7847#ifndef SQLITE_OMIT_AUTOVACUUM 7848 if( pBt->autoVacuum && aRoot[i]>1 ){ 7849 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); 7850 } 7851#endif 7852 checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL); 7853 } 7854 7855 /* Make sure every page in the file is referenced 7856 */ 7857 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 7858#ifdef SQLITE_OMIT_AUTOVACUUM 7859 if( sCheck.anRef[i]==0 ){ 7860 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 7861 } 7862#else 7863 /* If the database supports auto-vacuum, make sure no tables contain 7864 ** references to pointer-map pages. 7865 */ 7866 if( sCheck.anRef[i]==0 && 7867 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 7868 checkAppendMsg(&sCheck, 0, "Page %d is never used", i); 7869 } 7870 if( sCheck.anRef[i]!=0 && 7871 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 7872 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i); 7873 } 7874#endif 7875 } 7876 7877 /* Make sure this analysis did not leave any unref() pages. 7878 ** This is an internal consistency check; an integrity check 7879 ** of the integrity check. 7880 */ 7881 if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){ 7882 checkAppendMsg(&sCheck, 0, 7883 "Outstanding page count goes from %d to %d during this analysis", 7884 nRef, sqlite3PagerRefcount(pBt->pPager) 7885 ); 7886 } 7887 7888 /* Clean up and report errors. 7889 */ 7890 sqlite3BtreeLeave(p); 7891 sqlite3_free(sCheck.anRef); 7892 if( sCheck.mallocFailed ){ 7893 sqlite3StrAccumReset(&sCheck.errMsg); 7894 *pnErr = sCheck.nErr+1; 7895 return 0; 7896 } 7897 *pnErr = sCheck.nErr; 7898 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg); 7899 return sqlite3StrAccumFinish(&sCheck.errMsg); 7900} 7901#endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7902 7903/* 7904** Return the full pathname of the underlying database file. 7905** 7906** The pager filename is invariant as long as the pager is 7907** open so it is safe to access without the BtShared mutex. 7908*/ 7909const char *sqlite3BtreeGetFilename(Btree *p){ 7910 assert( p->pBt->pPager!=0 ); 7911 return sqlite3PagerFilename(p->pBt->pPager); 7912} 7913 7914/* 7915** Return the pathname of the journal file for this database. The return 7916** value of this routine is the same regardless of whether the journal file 7917** has been created or not. 7918** 7919** The pager journal filename is invariant as long as the pager is 7920** open so it is safe to access without the BtShared mutex. 7921*/ 7922const char *sqlite3BtreeGetJournalname(Btree *p){ 7923 assert( p->pBt->pPager!=0 ); 7924 return sqlite3PagerJournalname(p->pBt->pPager); 7925} 7926 7927/* 7928** Return non-zero if a transaction is active. 7929*/ 7930int sqlite3BtreeIsInTrans(Btree *p){ 7931 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 7932 return (p && (p->inTrans==TRANS_WRITE)); 7933} 7934 7935#ifndef SQLITE_OMIT_WAL 7936/* 7937** Run a checkpoint on the Btree passed as the first argument. 7938** 7939** Return SQLITE_LOCKED if this or any other connection has an open 7940** transaction on the shared-cache the argument Btree is connected to. 7941** 7942** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART. 7943*/ 7944int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){ 7945 int rc = SQLITE_OK; 7946 if( p ){ 7947 BtShared *pBt = p->pBt; 7948 sqlite3BtreeEnter(p); 7949 if( pBt->inTransaction!=TRANS_NONE ){ 7950 rc = SQLITE_LOCKED; 7951 }else{ 7952 rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt); 7953 } 7954 sqlite3BtreeLeave(p); 7955 } 7956 return rc; 7957} 7958#endif 7959 7960/* 7961** Return non-zero if a read (or write) transaction is active. 7962*/ 7963int sqlite3BtreeIsInReadTrans(Btree *p){ 7964 assert( p ); 7965 assert( sqlite3_mutex_held(p->db->mutex) ); 7966 return p->inTrans!=TRANS_NONE; 7967} 7968 7969int sqlite3BtreeIsInBackup(Btree *p){ 7970 assert( p ); 7971 assert( sqlite3_mutex_held(p->db->mutex) ); 7972 return p->nBackup!=0; 7973} 7974 7975/* 7976** This function returns a pointer to a blob of memory associated with 7977** a single shared-btree. The memory is used by client code for its own 7978** purposes (for example, to store a high-level schema associated with 7979** the shared-btree). The btree layer manages reference counting issues. 7980** 7981** The first time this is called on a shared-btree, nBytes bytes of memory 7982** are allocated, zeroed, and returned to the caller. For each subsequent 7983** call the nBytes parameter is ignored and a pointer to the same blob 7984** of memory returned. 7985** 7986** If the nBytes parameter is 0 and the blob of memory has not yet been 7987** allocated, a null pointer is returned. If the blob has already been 7988** allocated, it is returned as normal. 7989** 7990** Just before the shared-btree is closed, the function passed as the 7991** xFree argument when the memory allocation was made is invoked on the 7992** blob of allocated memory. The xFree function should not call sqlite3_free() 7993** on the memory, the btree layer does that. 7994*/ 7995void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 7996 BtShared *pBt = p->pBt; 7997 sqlite3BtreeEnter(p); 7998 if( !pBt->pSchema && nBytes ){ 7999 pBt->pSchema = sqlite3DbMallocZero(0, nBytes); 8000 pBt->xFreeSchema = xFree; 8001 } 8002 sqlite3BtreeLeave(p); 8003 return pBt->pSchema; 8004} 8005 8006/* 8007** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 8008** btree as the argument handle holds an exclusive lock on the 8009** sqlite_master table. Otherwise SQLITE_OK. 8010*/ 8011int sqlite3BtreeSchemaLocked(Btree *p){ 8012 int rc; 8013 assert( sqlite3_mutex_held(p->db->mutex) ); 8014 sqlite3BtreeEnter(p); 8015 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 8016 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 8017 sqlite3BtreeLeave(p); 8018 return rc; 8019} 8020 8021 8022#ifndef SQLITE_OMIT_SHARED_CACHE 8023/* 8024** Obtain a lock on the table whose root page is iTab. The 8025** lock is a write lock if isWritelock is true or a read lock 8026** if it is false. 8027*/ 8028int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 8029 int rc = SQLITE_OK; 8030 assert( p->inTrans!=TRANS_NONE ); 8031 if( p->sharable ){ 8032 u8 lockType = READ_LOCK + isWriteLock; 8033 assert( READ_LOCK+1==WRITE_LOCK ); 8034 assert( isWriteLock==0 || isWriteLock==1 ); 8035 8036 sqlite3BtreeEnter(p); 8037 rc = querySharedCacheTableLock(p, iTab, lockType); 8038 if( rc==SQLITE_OK ){ 8039 rc = setSharedCacheTableLock(p, iTab, lockType); 8040 } 8041 sqlite3BtreeLeave(p); 8042 } 8043 return rc; 8044} 8045#endif 8046 8047#ifndef SQLITE_OMIT_INCRBLOB 8048/* 8049** Argument pCsr must be a cursor opened for writing on an 8050** INTKEY table currently pointing at a valid table entry. 8051** This function modifies the data stored as part of that entry. 8052** 8053** Only the data content may only be modified, it is not possible to 8054** change the length of the data stored. If this function is called with 8055** parameters that attempt to write past the end of the existing data, 8056** no modifications are made and SQLITE_CORRUPT is returned. 8057*/ 8058int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 8059 int rc; 8060 assert( cursorHoldsMutex(pCsr) ); 8061 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 8062 assert( pCsr->isIncrblobHandle ); 8063 8064 rc = restoreCursorPosition(pCsr); 8065 if( rc!=SQLITE_OK ){ 8066 return rc; 8067 } 8068 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 8069 if( pCsr->eState!=CURSOR_VALID ){ 8070 return SQLITE_ABORT; 8071 } 8072 8073 /* Check some assumptions: 8074 ** (a) the cursor is open for writing, 8075 ** (b) there is a read/write transaction open, 8076 ** (c) the connection holds a write-lock on the table (if required), 8077 ** (d) there are no conflicting read-locks, and 8078 ** (e) the cursor points at a valid row of an intKey table. 8079 */ 8080 if( !pCsr->wrFlag ){ 8081 return SQLITE_READONLY; 8082 } 8083 assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE ); 8084 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); 8085 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); 8086 assert( pCsr->apPage[pCsr->iPage]->intKey ); 8087 8088 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); 8089} 8090 8091/* 8092** Set a flag on this cursor to cache the locations of pages from the 8093** overflow list for the current row. This is used by cursors opened 8094** for incremental blob IO only. 8095** 8096** This function sets a flag only. The actual page location cache 8097** (stored in BtCursor.aOverflow[]) is allocated and used by function 8098** accessPayload() (the worker function for sqlite3BtreeData() and 8099** sqlite3BtreePutData()). 8100*/ 8101void sqlite3BtreeCacheOverflow(BtCursor *pCur){ 8102 assert( cursorHoldsMutex(pCur) ); 8103 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 8104 invalidateOverflowCache(pCur); 8105 pCur->isIncrblobHandle = 1; 8106} 8107#endif 8108 8109/* 8110** Set both the "read version" (single byte at byte offset 18) and 8111** "write version" (single byte at byte offset 19) fields in the database 8112** header to iVersion. 8113*/ 8114int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ 8115 BtShared *pBt = pBtree->pBt; 8116 int rc; /* Return code */ 8117 8118 assert( pBtree->inTrans==TRANS_NONE ); 8119 assert( iVersion==1 || iVersion==2 ); 8120 8121 /* If setting the version fields to 1, do not automatically open the 8122 ** WAL connection, even if the version fields are currently set to 2. 8123 */ 8124 pBt->doNotUseWAL = (u8)(iVersion==1); 8125 8126 rc = sqlite3BtreeBeginTrans(pBtree, 0); 8127 if( rc==SQLITE_OK ){ 8128 u8 *aData = pBt->pPage1->aData; 8129 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){ 8130 rc = sqlite3BtreeBeginTrans(pBtree, 2); 8131 if( rc==SQLITE_OK ){ 8132 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 8133 if( rc==SQLITE_OK ){ 8134 aData[18] = (u8)iVersion; 8135 aData[19] = (u8)iVersion; 8136 } 8137 } 8138 } 8139 } 8140 8141 pBt->doNotUseWAL = 0; 8142 return rc; 8143} 8144