diff options
author | mdm@chromium.org <mdm@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-09-18 18:27:25 +0000 |
---|---|---|
committer | mdm@chromium.org <mdm@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-09-18 18:27:25 +0000 |
commit | 997e22224e1062a4cd39373057a68879a1d7a3ac (patch) | |
tree | a90a9ce4272fc78f2459b1b2c78b52a3f6d4e5d3 /third_party/sqlite/src/btree.c | |
parent | 0d683c611a18dc6ea0e99f38c73b4fb96611041f (diff) | |
download | chromium_src-997e22224e1062a4cd39373057a68879a1d7a3ac.zip chromium_src-997e22224e1062a4cd39373057a68879a1d7a3ac.tar.gz chromium_src-997e22224e1062a4cd39373057a68879a1d7a3ac.tar.bz2 |
Update sqlite to version 3.6.18, porting our patches.
Hopefully this will help to address some valgrind issues.
BUG=none
TEST=none
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@26596 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'third_party/sqlite/src/btree.c')
-rw-r--r-- | third_party/sqlite/src/btree.c | 5455 |
1 files changed, 2968 insertions, 2487 deletions
diff --git a/third_party/sqlite/src/btree.c b/third_party/sqlite/src/btree.c index dc71523..2db3c90 100644 --- a/third_party/sqlite/src/btree.c +++ b/third_party/sqlite/src/btree.c @@ -9,7 +9,7 @@ ** May you share freely, never taking more than you give. ** ************************************************************************* -** $Id: btree.c,v 1.495 2008/08/02 17:36:46 danielk1977 Exp $ +** $Id: btree.c,v 1.705 2009/08/10 03:57:58 shane Exp $ ** ** This file implements a external (disk-based) database using BTrees. ** See the header comment on "btreeInt.h" for additional information. @@ -34,7 +34,7 @@ static const char zPoisonHeader[] = "SQLite poison 3"; ** macro. */ #if 0 -int sqlite3BtreeTrace=0; /* True to enable tracing */ +int sqlite3BtreeTrace=1; /* True to enable tracing */ # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} #else # define TRACE(X) @@ -44,18 +44,17 @@ int sqlite3BtreeTrace=0; /* True to enable tracing */ #ifndef SQLITE_OMIT_SHARED_CACHE /* -** A flag to indicate whether or not shared cache is enabled. Also, -** a list of BtShared objects that are eligible for participation -** in shared cache. The variables have file scope during normal builds, -** but the test harness needs to access these variables so we make them -** global for test builds. +** A list of BtShared objects that are eligible for participation +** in shared cache. This variable has file scope during normal builds, +** but the test harness needs to access it so we make it global for +** test builds. +** +** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER. */ #ifdef SQLITE_TEST -BtShared *sqlite3SharedCacheList = 0; -int sqlite3SharedCacheEnabled = 0; +BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; #else -static BtShared *sqlite3SharedCacheList = 0; -static int sqlite3SharedCacheEnabled = 0; +static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; #endif #endif /* SQLITE_OMIT_SHARED_CACHE */ @@ -68,46 +67,155 @@ static int sqlite3SharedCacheEnabled = 0; ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). */ int sqlite3_enable_shared_cache(int enable){ - sqlite3SharedCacheEnabled = enable; + sqlite3GlobalConfig.sharedCacheEnabled = enable; return SQLITE_OK; } #endif -/* -** Forward declaration -*/ -static int checkReadLocks(Btree*, Pgno, BtCursor*, i64); - #ifdef SQLITE_OMIT_SHARED_CACHE /* - ** The functions queryTableLock(), lockTable() and unlockAllTables() + ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), + ** and clearAllSharedCacheTableLocks() ** manipulate entries in the BtShared.pLock linked list used to store ** shared-cache table level locks. If the library is compiled with the ** shared-cache feature disabled, then there is only ever one user ** of each BtShared structure and so this locking is not necessary. ** So define the lock related functions as no-ops. */ - #define queryTableLock(a,b,c) SQLITE_OK - #define lockTable(a,b,c) SQLITE_OK - #define unlockAllTables(a) + #define querySharedCacheTableLock(a,b,c) SQLITE_OK + #define setSharedCacheTableLock(a,b,c) SQLITE_OK + #define clearAllSharedCacheTableLocks(a) + #define downgradeAllSharedCacheTableLocks(a) + #define hasSharedCacheTableLock(a,b,c,d) 1 + #define hasReadConflicts(a, b) 0 #endif #ifndef SQLITE_OMIT_SHARED_CACHE + +#ifdef SQLITE_DEBUG +/* +** This function is only used as part of an assert() statement. It checks +** that connection p holds the required locks to read or write to the +** b-tree with root page iRoot. If so, true is returned. Otherwise, false. +** For example, when writing to a table b-tree with root-page iRoot via +** Btree connection pBtree: +** +** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); +** +** When writing to an index b-tree that resides in a sharable database, the +** caller should have first obtained a lock specifying the root page of +** the corresponding table b-tree. This makes things a bit more complicated, +** as this module treats each b-tree as a separate structure. To determine +** the table b-tree corresponding to the index b-tree being written, this +** function has to search through the database schema. +** +** Instead of a lock on the b-tree rooted at page iRoot, the caller may +** hold a write-lock on the schema table (root page 1). This is also +** acceptable. +*/ +static int hasSharedCacheTableLock( + Btree *pBtree, /* Handle that must hold lock */ + Pgno iRoot, /* Root page of b-tree */ + int isIndex, /* True if iRoot is the root of an index b-tree */ + int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */ +){ + Schema *pSchema = (Schema *)pBtree->pBt->pSchema; + Pgno iTab = 0; + BtLock *pLock; + + /* If this b-tree database is not shareable, or if the client is reading + ** and has the read-uncommitted flag set, then no lock is required. + ** In these cases return true immediately. If the client is reading + ** or writing an index b-tree, but the schema is not loaded, then return + ** true also. In this case the lock is required, but it is too difficult + ** to check if the client actually holds it. This doesn't happen very + ** often. */ + if( (pBtree->sharable==0) + || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted)) + || (isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0 )) + ){ + return 1; + } + + /* Figure out the root-page that the lock should be held on. For table + ** b-trees, this is just the root page of the b-tree being read or + ** written. For index b-trees, it is the root page of the associated + ** table. */ + if( isIndex ){ + HashElem *p; + for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ + Index *pIdx = (Index *)sqliteHashData(p); + if( pIdx->tnum==(int)iRoot ){ + iTab = pIdx->pTable->tnum; + } + } + }else{ + iTab = iRoot; + } + + /* Search for the required lock. Either a write-lock on root-page iTab, a + ** write-lock on the schema table, or (if the client is reading) a + ** read-lock on iTab will suffice. Return 1 if any of these are found. */ + for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ + if( pLock->pBtree==pBtree + && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) + && pLock->eLock>=eLockType + ){ + return 1; + } + } + + /* Failed to find the required lock. */ + return 0; +} + +/* +** This function is also used as part of assert() statements only. It +** returns true if there exist one or more cursors open on the table +** with root page iRoot that do not belong to either connection pBtree +** or some other connection that has the read-uncommitted flag set. +** +** For example, before writing to page iRoot: +** +** assert( !hasReadConflicts(pBtree, iRoot) ); +*/ +static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ + BtCursor *p; + for(p=pBtree->pBt->pCursor; p; p=p->pNext){ + if( p->pgnoRoot==iRoot + && p->pBtree!=pBtree + && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted) + ){ + return 1; + } + } + return 0; +} +#endif /* #ifdef SQLITE_DEBUG */ + /* ** Query to see if btree handle p may obtain a lock of type eLock ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return -** SQLITE_OK if the lock may be obtained (by calling lockTable()), or -** SQLITE_LOCKED if not. +** SQLITE_OK if the lock may be obtained (by calling +** setSharedCacheTableLock()), or SQLITE_LOCKED if not. */ -static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){ +static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ BtShared *pBt = p->pBt; BtLock *pIter; assert( sqlite3BtreeHoldsMutex(p) ); assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); assert( p->db!=0 ); + assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 ); + + /* If requesting a write-lock, then the Btree must have an open write + ** transaction on this file. And, obviously, for this to be so there + ** must be an open write transaction on the file itself. + */ + assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); + assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); /* This is a no-op if the shared-cache is not enabled */ if( !p->sharable ){ @@ -117,34 +225,30 @@ static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){ /* If some other connection is holding an exclusive lock, the ** requested lock may not be obtained. */ - if( pBt->pExclusive && pBt->pExclusive!=p ){ - return SQLITE_LOCKED; + if( pBt->pWriter!=p && pBt->isExclusive ){ + sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); + return SQLITE_LOCKED_SHAREDCACHE; } - /* This (along with lockTable()) is where the ReadUncommitted flag is - ** dealt with. If the caller is querying for a read-lock and the flag is - ** set, it is unconditionally granted - even if there are write-locks - ** on the table. If a write-lock is requested, the ReadUncommitted flag - ** is not considered. - ** - ** In function lockTable(), if a read-lock is demanded and the - ** ReadUncommitted flag is set, no entry is added to the locks list - ** (BtShared.pLock). - ** - ** To summarize: If the ReadUncommitted flag is set, then read cursors do - ** not create or respect table locks. The locking procedure for a - ** write-cursor does not change. - */ - if( - 0==(p->db->flags&SQLITE_ReadUncommitted) || - eLock==WRITE_LOCK || - iTab==MASTER_ROOT - ){ - for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ - if( pIter->pBtree!=p && pIter->iTable==iTab && - (pIter->eLock!=eLock || eLock!=READ_LOCK) ){ - return SQLITE_LOCKED; + for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ + /* The condition (pIter->eLock!=eLock) in the following if(...) + ** statement is a simplification of: + ** + ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) + ** + ** since we know that if eLock==WRITE_LOCK, then no other connection + ** may hold a WRITE_LOCK on any table in this file (since there can + ** only be a single writer). + */ + assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); + assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); + if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ + sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); + if( eLock==WRITE_LOCK ){ + assert( p==pBt->pWriter ); + pBt->isPending = 1; } + return SQLITE_LOCKED_SHAREDCACHE; } } return SQLITE_OK; @@ -157,10 +261,19 @@ static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){ ** by Btree handle p. Parameter eLock must be either READ_LOCK or ** WRITE_LOCK. ** -** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and -** SQLITE_NOMEM may also be returned. +** This function assumes the following: +** +** (a) The specified b-tree connection handle is connected to a sharable +** b-tree database (one with the BtShared.sharable) flag set, and +** +** (b) No other b-tree connection handle holds a lock that conflicts +** with the requested lock (i.e. querySharedCacheTableLock() has +** already been called and returned SQLITE_OK). +** +** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM +** is returned if a malloc attempt fails. */ -static int lockTable(Btree *p, Pgno iTable, u8 eLock){ +static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ BtShared *pBt = p->pBt; BtLock *pLock = 0; BtLock *pIter; @@ -169,25 +282,16 @@ static int lockTable(Btree *p, Pgno iTable, u8 eLock){ assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); assert( p->db!=0 ); - /* This is a no-op if the shared-cache is not enabled */ - if( !p->sharable ){ - return SQLITE_OK; - } + /* A connection with the read-uncommitted flag set will never try to + ** obtain a read-lock using this function. The only read-lock obtained + ** by a connection in read-uncommitted mode is on the sqlite_master + ** table, and that lock is obtained in BtreeBeginTrans(). */ + assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK ); - assert( SQLITE_OK==queryTableLock(p, iTable, eLock) ); - - /* If the read-uncommitted flag is set and a read-lock is requested, - ** return early without adding an entry to the BtShared.pLock list. See - ** comment in function queryTableLock() for more info on handling - ** the ReadUncommitted flag. - */ - if( - (p->db->flags&SQLITE_ReadUncommitted) && - (eLock==READ_LOCK) && - iTable!=MASTER_ROOT - ){ - return SQLITE_OK; - } + /* This function should only be called on a sharable b-tree after it + ** has been determined that no other b-tree holds a conflicting lock. */ + assert( p->sharable ); + assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); /* First search the list for an existing lock on this table. */ for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ @@ -226,31 +330,72 @@ static int lockTable(Btree *p, Pgno iTable, u8 eLock){ #ifndef SQLITE_OMIT_SHARED_CACHE /* -** Release all the table locks (locks obtained via calls to the lockTable() -** procedure) held by Btree handle p. +** Release all the table locks (locks obtained via calls to +** the setSharedCacheTableLock() procedure) held by Btree handle p. +** +** This function assumes that handle p has an open read or write +** transaction. If it does not, then the BtShared.isPending variable +** may be incorrectly cleared. */ -static void unlockAllTables(Btree *p){ +static void clearAllSharedCacheTableLocks(Btree *p){ BtShared *pBt = p->pBt; BtLock **ppIter = &pBt->pLock; assert( sqlite3BtreeHoldsMutex(p) ); assert( p->sharable || 0==*ppIter ); + assert( p->inTrans>0 ); while( *ppIter ){ BtLock *pLock = *ppIter; - assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree ); + assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree ); + assert( pLock->pBtree->inTrans>=pLock->eLock ); if( pLock->pBtree==p ){ *ppIter = pLock->pNext; - sqlite3_free(pLock); + assert( pLock->iTable!=1 || pLock==&p->lock ); + if( pLock->iTable!=1 ){ + sqlite3_free(pLock); + } }else{ ppIter = &pLock->pNext; } } - if( pBt->pExclusive==p ){ - pBt->pExclusive = 0; + assert( pBt->isPending==0 || pBt->pWriter ); + if( pBt->pWriter==p ){ + pBt->pWriter = 0; + pBt->isExclusive = 0; + pBt->isPending = 0; + }else if( pBt->nTransaction==2 ){ + /* This function is called when connection p is concluding its + ** transaction. If there currently exists a writer, and p is not + ** that writer, then the number of locks held by connections other + ** than the writer must be about to drop to zero. In this case + ** set the isPending flag to 0. + ** + ** If there is not currently a writer, then BtShared.isPending must + ** be zero already. So this next line is harmless in that case. + */ + pBt->isPending = 0; + } +} + +/* +** This function changes all write-locks held by connection p to read-locks. +*/ +static void downgradeAllSharedCacheTableLocks(Btree *p){ + BtShared *pBt = p->pBt; + if( pBt->pWriter==p ){ + BtLock *pLock; + pBt->pWriter = 0; + pBt->isExclusive = 0; + pBt->isPending = 0; + for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ + assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); + pLock->eLock = READ_LOCK; + } } } + #endif /* SQLITE_OMIT_SHARED_CACHE */ static void releasePage(MemPage *pPage); /* Forward reference */ @@ -286,14 +431,121 @@ static void invalidateAllOverflowCache(BtShared *pBt){ invalidateOverflowCache(p); } } + +/* +** This function is called before modifying the contents of a table +** b-tree to invalidate any incrblob cursors that are open on the +** row or one of the rows being modified. +** +** If argument isClearTable is true, then the entire contents of the +** table is about to be deleted. In this case invalidate all incrblob +** cursors open on any row within the table with root-page pgnoRoot. +** +** Otherwise, if argument isClearTable is false, then the row with +** rowid iRow is being replaced or deleted. In this case invalidate +** only those incrblob cursors open on this specific row. +*/ +static void invalidateIncrblobCursors( + Btree *pBtree, /* The database file to check */ + i64 iRow, /* The rowid that might be changing */ + int isClearTable /* True if all rows are being deleted */ +){ + BtCursor *p; + BtShared *pBt = pBtree->pBt; + assert( sqlite3BtreeHoldsMutex(pBtree) ); + for(p=pBt->pCursor; p; p=p->pNext){ + if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){ + p->eState = CURSOR_INVALID; + } + } +} + #else #define invalidateOverflowCache(x) #define invalidateAllOverflowCache(x) + #define invalidateIncrblobCursors(x,y,z) #endif /* +** Set bit pgno of the BtShared.pHasContent bitvec. This is called +** when a page that previously contained data becomes a free-list leaf +** page. +** +** The BtShared.pHasContent bitvec exists to work around an obscure +** bug caused by the interaction of two useful IO optimizations surrounding +** free-list leaf pages: +** +** 1) When all data is deleted from a page and the page becomes +** a free-list leaf page, the page is not written to the database +** (as free-list leaf pages contain no meaningful data). Sometimes +** such a page is not even journalled (as it will not be modified, +** why bother journalling it?). +** +** 2) When a free-list leaf page is reused, its content is not read +** from the database or written to the journal file (why should it +** be, if it is not at all meaningful?). +** +** By themselves, these optimizations work fine and provide a handy +** performance boost to bulk delete or insert operations. However, if +** a page is moved to the free-list and then reused within the same +** transaction, a problem comes up. If the page is not journalled when +** it is moved to the free-list and it is also not journalled when it +** is extracted from the free-list and reused, then the original data +** may be lost. In the event of a rollback, it may not be possible +** to restore the database to its original configuration. +** +** The solution is the BtShared.pHasContent bitvec. Whenever a page is +** moved to become a free-list leaf page, the corresponding bit is +** set in the bitvec. Whenever a leaf page is extracted from the free-list, +** optimization 2 above is ommitted if the corresponding bit is already +** set in BtShared.pHasContent. The contents of the bitvec are cleared +** at the end of every transaction. +*/ +static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ + int rc = SQLITE_OK; + if( !pBt->pHasContent ){ + int nPage = 100; + sqlite3PagerPagecount(pBt->pPager, &nPage); + /* If sqlite3PagerPagecount() fails there is no harm because the + ** nPage variable is unchanged from its default value of 100 */ + pBt->pHasContent = sqlite3BitvecCreate((u32)nPage); + if( !pBt->pHasContent ){ + rc = SQLITE_NOMEM; + } + } + if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ + rc = sqlite3BitvecSet(pBt->pHasContent, pgno); + } + return rc; +} + +/* +** Query the BtShared.pHasContent vector. +** +** This function is called when a free-list leaf page is removed from the +** free-list for reuse. It returns false if it is safe to retrieve the +** page from the pager layer with the 'no-content' flag set. True otherwise. +*/ +static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ + Bitvec *p = pBt->pHasContent; + return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno))); +} + +/* +** Clear (destroy) the BtShared.pHasContent bitvec. This should be +** invoked at the conclusion of each write-transaction. +*/ +static void btreeClearHasContent(BtShared *pBt){ + sqlite3BitvecDestroy(pBt->pHasContent); + pBt->pHasContent = 0; +} + +/* ** Save the current cursor position in the variables BtCursor.nKey ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. +** +** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) +** prior to calling this routine. */ static int saveCursorPosition(BtCursor *pCur){ int rc; @@ -303,6 +555,7 @@ static int saveCursorPosition(BtCursor *pCur){ assert( cursorHoldsMutex(pCur) ); rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); + assert( rc==SQLITE_OK ); /* KeySize() cannot fail */ /* If this is an intKey table, then the above call to BtreeKeySize() ** stores the integer key in pCur->nKey. In this case this value is @@ -310,10 +563,10 @@ static int saveCursorPosition(BtCursor *pCur){ ** table, then malloc space for and store the pCur->nKey bytes of key ** data. */ - if( rc==SQLITE_OK && 0==pCur->pPage->intKey){ - void *pKey = sqlite3Malloc(pCur->nKey); + if( 0==pCur->apPage[0]->intKey ){ + void *pKey = sqlite3Malloc( (int)pCur->nKey ); if( pKey ){ - rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey); + rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey); if( rc==SQLITE_OK ){ pCur->pKey = pKey; }else{ @@ -323,11 +576,15 @@ static int saveCursorPosition(BtCursor *pCur){ rc = SQLITE_NOMEM; } } - assert( !pCur->pPage->intKey || !pCur->pKey ); + assert( !pCur->apPage[0]->intKey || !pCur->pKey ); if( rc==SQLITE_OK ){ - releasePage(pCur->pPage); - pCur->pPage = 0; + int i; + for(i=0; i<=pCur->iPage; i++){ + releasePage(pCur->apPage[i]); + pCur->apPage[i] = 0; + } + pCur->iPage = -1; pCur->eState = CURSOR_REQUIRESEEK; } @@ -359,7 +616,7 @@ static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ /* ** Clear the current cursor position. */ -static void clearCursorPosition(BtCursor *pCur){ +void sqlite3BtreeClearCursor(BtCursor *pCur){ assert( cursorHoldsMutex(pCur) ); sqlite3_free(pCur->pKey); pCur->pKey = 0; @@ -367,21 +624,52 @@ static void clearCursorPosition(BtCursor *pCur){ } /* +** In this version of BtreeMoveto, pKey is a packed index record +** such as is generated by the OP_MakeRecord opcode. Unpack the +** record and then call BtreeMovetoUnpacked() to do the work. +*/ +static int btreeMoveto( + BtCursor *pCur, /* Cursor open on the btree to be searched */ + const void *pKey, /* Packed key if the btree is an index */ + i64 nKey, /* Integer key for tables. Size of pKey for indices */ + int bias, /* Bias search to the high end */ + int *pRes /* Write search results here */ +){ + int rc; /* Status code */ + UnpackedRecord *pIdxKey; /* Unpacked index key */ + char aSpace[150]; /* Temp space for pIdxKey - to avoid a malloc */ + + if( pKey ){ + assert( nKey==(i64)(int)nKey ); + pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, + aSpace, sizeof(aSpace)); + if( pIdxKey==0 ) return SQLITE_NOMEM; + }else{ + pIdxKey = 0; + } + rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes); + if( pKey ){ + sqlite3VdbeDeleteUnpackedRecord(pIdxKey); + } + return rc; +} + +/* ** Restore the cursor to the position it was in (or as close to as possible) ** when saveCursorPosition() was called. Note that this call deletes the ** saved position info stored by saveCursorPosition(), so there can be ** at most one effective restoreCursorPosition() call after each ** saveCursorPosition(). */ -int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){ +static int btreeRestoreCursorPosition(BtCursor *pCur){ int rc; assert( cursorHoldsMutex(pCur) ); assert( pCur->eState>=CURSOR_REQUIRESEEK ); if( pCur->eState==CURSOR_FAULT ){ - return pCur->skip; + return pCur->skipNext; } pCur->eState = CURSOR_INVALID; - rc = sqlite3BtreeMoveto(pCur, pCur->pKey, 0, pCur->nKey, 0, &pCur->skip); + rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext); if( rc==SQLITE_OK ){ sqlite3_free(pCur->pKey); pCur->pKey = 0; @@ -392,12 +680,12 @@ int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){ #define restoreCursorPosition(p) \ (p->eState>=CURSOR_REQUIRESEEK ? \ - sqlite3BtreeRestoreCursorPosition(p) : \ + btreeRestoreCursorPosition(p) : \ SQLITE_OK) /* ** Determine whether or not a cursor has moved from the position it -** was last placed at. Cursor can move when the row they are pointing +** was last placed at. Cursors can move when the row they are pointing ** at is deleted out from under them. ** ** This routine returns an error code if something goes wrong. The @@ -411,7 +699,7 @@ int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){ *pHasMoved = 1; return rc; } - if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){ + if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){ *pHasMoved = 1; }else{ *pHasMoved = 0; @@ -426,7 +714,8 @@ int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){ ** input page number. */ static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ - int nPagesPerMapPage, iPtrMap, ret; + int nPagesPerMapPage; + Pgno iPtrMap, ret; assert( sqlite3_mutex_held(pBt->mutex) ); nPagesPerMapPage = (pBt->usableSize/5)+1; iPtrMap = (pgno-2)/nPagesPerMapPage; @@ -442,14 +731,19 @@ static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ ** ** This routine updates the pointer map entry for page number 'key' ** so that it maps to type 'eType' and parent page number 'pgno'. -** An error code is returned if something goes wrong, otherwise SQLITE_OK. +** +** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is +** a no-op. If an error occurs, the appropriate error code is written +** into *pRC. */ -static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){ +static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ DbPage *pDbPage; /* The pointer map page */ u8 *pPtrmap; /* The pointer map data */ Pgno iPtrmap; /* The pointer map page number */ int offset; /* Offset in pointer map page */ - int rc; + int rc; /* Return code from subfunctions */ + + if( *pRC ) return; assert( sqlite3_mutex_held(pBt->mutex) ); /* The master-journal page number must never be used as a pointer map page */ @@ -457,27 +751,33 @@ static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){ assert( pBt->autoVacuum ); if( key==0 ){ - return SQLITE_CORRUPT_BKPT; + *pRC = SQLITE_CORRUPT_BKPT; + return; } iPtrmap = PTRMAP_PAGENO(pBt, key); rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); if( rc!=SQLITE_OK ){ - return rc; + *pRC = rc; + return; } offset = PTRMAP_PTROFFSET(iPtrmap, key); + if( offset<0 ){ + *pRC = SQLITE_CORRUPT_BKPT; + goto ptrmap_exit; + } pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); - rc = sqlite3PagerWrite(pDbPage); + *pRC= rc = sqlite3PagerWrite(pDbPage); if( rc==SQLITE_OK ){ pPtrmap[offset] = eType; put4byte(&pPtrmap[offset+1], parent); } } +ptrmap_exit: sqlite3PagerUnref(pDbPage); - return rc; } /* @@ -514,9 +814,9 @@ static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ } #else /* if defined SQLITE_OMIT_AUTOVACUUM */ - #define ptrmapPut(w,x,y,z) SQLITE_OK + #define ptrmapPut(w,x,y,z,rc) #define ptrmapGet(w,x,y,z) SQLITE_OK - #define ptrmapPutOvfl(y,z) SQLITE_OK + #define ptrmapPutOvflPtr(x, y, rc) #endif /* @@ -531,7 +831,7 @@ static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ /* ** This a more complex version of findCell() that works for -** pages that do contain overflow cells. See insert +** pages that do contain overflow cells. */ static u8 *findOverflowCell(MemPage *pPage, int iCell){ int i; @@ -553,19 +853,19 @@ static u8 *findOverflowCell(MemPage *pPage, int iCell){ /* ** Parse a cell content block and fill in the CellInfo structure. There -** are two versions of this function. sqlite3BtreeParseCell() takes a -** cell index as the second argument and sqlite3BtreeParseCellPtr() +** are two versions of this function. btreeParseCell() takes a +** cell index as the second argument and btreeParseCellPtr() ** takes a pointer to the body of the cell as its second argument. ** ** Within this file, the parseCell() macro can be called instead of -** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster. +** btreeParseCellPtr(). Using some compilers, this will be faster. */ -void sqlite3BtreeParseCellPtr( +static void btreeParseCellPtr( MemPage *pPage, /* Page containing the cell */ u8 *pCell, /* Pointer to the cell text. */ CellInfo *pInfo /* Fill in this structure */ ){ - int n; /* Number bytes in cell content header */ + u16 n; /* Number bytes in cell content header */ u32 nPayload; /* Number of bytes of cell payload */ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); @@ -589,18 +889,20 @@ void sqlite3BtreeParseCellPtr( } pInfo->nPayload = nPayload; pInfo->nHeader = n; + testcase( nPayload==pPage->maxLocal ); + testcase( nPayload==pPage->maxLocal+1 ); if( likely(nPayload<=pPage->maxLocal) ){ /* This is the (easy) common case where the entire payload fits ** on the local page. No overflow is required. */ int nSize; /* Total size of cell content in bytes */ nSize = nPayload + n; - pInfo->nLocal = nPayload; + pInfo->nLocal = (u16)nPayload; pInfo->iOverflow = 0; if( (nSize & ~3)==0 ){ nSize = 4; /* Minimum cell size is 4 */ } - pInfo->nSize = nSize; + pInfo->nSize = (u16)nSize; }else{ /* If the payload will not fit completely on the local page, we have ** to decide how much to store locally and how much to spill onto @@ -618,18 +920,20 @@ void sqlite3BtreeParseCellPtr( minLocal = pPage->minLocal; maxLocal = pPage->maxLocal; surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); + testcase( surplus==maxLocal ); + testcase( surplus==maxLocal+1 ); if( surplus <= maxLocal ){ - pInfo->nLocal = surplus; + pInfo->nLocal = (u16)surplus; }else{ - pInfo->nLocal = minLocal; + pInfo->nLocal = (u16)minLocal; } - pInfo->iOverflow = pInfo->nLocal + n; + pInfo->iOverflow = (u16)(pInfo->nLocal + n); pInfo->nSize = pInfo->iOverflow + 4; } } #define parseCell(pPage, iCell, pInfo) \ - sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) -void sqlite3BtreeParseCell( + btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) +static void btreeParseCell( MemPage *pPage, /* Page containing the cell */ int iCell, /* The cell index. First cell is 0 */ CellInfo *pInfo /* Fill in this structure */ @@ -643,18 +947,63 @@ void sqlite3BtreeParseCell( ** data header and the local payload, but not any overflow page or ** the space used by the cell pointer. */ +static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ + u8 *pIter = &pCell[pPage->childPtrSize]; + u32 nSize; + +#ifdef SQLITE_DEBUG + /* The value returned by this function should always be the same as + ** the (CellInfo.nSize) value found by doing a full parse of the + ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of + ** this function verifies that this invariant is not violated. */ + CellInfo debuginfo; + btreeParseCellPtr(pPage, pCell, &debuginfo); +#endif + + if( pPage->intKey ){ + u8 *pEnd; + if( pPage->hasData ){ + pIter += getVarint32(pIter, nSize); + }else{ + nSize = 0; + } + + /* pIter now points at the 64-bit integer key value, a variable length + ** integer. The following block moves pIter to point at the first byte + ** past the end of the key value. */ + pEnd = &pIter[9]; + while( (*pIter++)&0x80 && pIter<pEnd ); + }else{ + pIter += getVarint32(pIter, nSize); + } + + testcase( nSize==pPage->maxLocal ); + testcase( nSize==pPage->maxLocal+1 ); + if( nSize>pPage->maxLocal ){ + int minLocal = pPage->minLocal; + nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); + testcase( nSize==pPage->maxLocal ); + testcase( nSize==pPage->maxLocal+1 ); + if( nSize>pPage->maxLocal ){ + nSize = minLocal; + } + nSize += 4; + } + nSize += (u32)(pIter - pCell); + + /* The minimum size of any cell is 4 bytes. */ + if( nSize<4 ){ + nSize = 4; + } + + assert( nSize==debuginfo.nSize ); + return (u16)nSize; +} #ifndef NDEBUG static u16 cellSize(MemPage *pPage, int iCell){ - CellInfo info; - sqlite3BtreeParseCell(pPage, iCell, &info); - return info.nSize; + return cellSizePtr(pPage, findCell(pPage, iCell)); } #endif -static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ - CellInfo info; - sqlite3BtreeParseCellPtr(pPage, pCell, &info); - return info.nSize; -} #ifndef SQLITE_OMIT_AUTOVACUUM /* @@ -662,27 +1011,16 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ ** to an overflow page, insert an entry into the pointer-map ** for the overflow page. */ -static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){ +static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){ CellInfo info; + if( *pRC ) return; assert( pCell!=0 ); - sqlite3BtreeParseCellPtr(pPage, pCell, &info); + btreeParseCellPtr(pPage, pCell, &info); assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); - if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ + if( info.iOverflow ){ Pgno ovfl = get4byte(&pCell[info.iOverflow]); - return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno); + ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); } - return SQLITE_OK; -} -/* -** If the cell with index iCell on page pPage contains a pointer -** to an overflow page, insert an entry into the pointer-map -** for the overflow page. -*/ -static int ptrmapPutOvfl(MemPage *pPage, int iCell){ - u8 *pCell; - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - pCell = findOverflowCell(pPage, iCell); - return ptrmapPutOvflPtr(pPage, pCell); } #endif @@ -693,18 +1031,20 @@ static int ptrmapPutOvfl(MemPage *pPage, int iCell){ ** big FreeBlk that occurs in between the header and cell ** pointer array and the cell content area. */ -static void defragmentPage(MemPage *pPage){ +static int defragmentPage(MemPage *pPage){ int i; /* Loop counter */ int pc; /* Address of a i-th cell */ - int addr; /* Offset of first byte after cell pointer array */ int hdr; /* Offset to the page header */ int size; /* Size of a cell */ int usableSize; /* Number of usable bytes on a page */ int cellOffset; /* Offset to the cell pointer array */ - int brk; /* Offset to the cell content area */ + int cbrk; /* Offset to the cell content area */ int nCell; /* Number of cells on the page */ unsigned char *data; /* The page data */ unsigned char *temp; /* Temp area for cell content */ + int iCellFirst; /* First allowable cell index */ + int iCellLast; /* Last possible cell index */ + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( pPage->pBt!=0 ); @@ -718,94 +1058,150 @@ static void defragmentPage(MemPage *pPage){ nCell = pPage->nCell; assert( nCell==get2byte(&data[hdr+3]) ); usableSize = pPage->pBt->usableSize; - brk = get2byte(&data[hdr+5]); - memcpy(&temp[brk], &data[brk], usableSize - brk); - brk = usableSize; + cbrk = get2byte(&data[hdr+5]); + memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); + cbrk = usableSize; + iCellFirst = cellOffset + 2*nCell; + iCellLast = usableSize - 4; for(i=0; i<nCell; i++){ u8 *pAddr; /* The i-th cell pointer */ pAddr = &data[cellOffset + i*2]; pc = get2byte(pAddr); - assert( pc<pPage->pBt->usableSize ); + testcase( pc==iCellFirst ); + testcase( pc==iCellLast ); +#if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) + /* These conditions have already been verified in btreeInitPage() + ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined + */ + if( pc<iCellFirst || pc>iCellLast ){ + return SQLITE_CORRUPT_BKPT; + } +#endif + assert( pc>=iCellFirst && pc<=iCellLast ); size = cellSizePtr(pPage, &temp[pc]); - brk -= size; - memcpy(&data[brk], &temp[pc], size); - put2byte(pAddr, brk); - } - assert( brk>=cellOffset+2*nCell ); - put2byte(&data[hdr+5], brk); + cbrk -= size; +#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) + if( cbrk<iCellFirst ){ + return SQLITE_CORRUPT_BKPT; + } +#else + if( cbrk<iCellFirst || pc+size>usableSize ){ + return SQLITE_CORRUPT_BKPT; + } +#endif + assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); + testcase( cbrk+size==usableSize ); + testcase( pc+size==usableSize ); + memcpy(&data[cbrk], &temp[pc], size); + put2byte(pAddr, cbrk); + } + assert( cbrk>=iCellFirst ); + put2byte(&data[hdr+5], cbrk); data[hdr+1] = 0; data[hdr+2] = 0; data[hdr+7] = 0; - addr = cellOffset+2*nCell; - memset(&data[addr], 0, brk-addr); + memset(&data[iCellFirst], 0, cbrk-iCellFirst); + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); + if( cbrk-iCellFirst!=pPage->nFree ){ + return SQLITE_CORRUPT_BKPT; + } + return SQLITE_OK; } /* -** Allocate nByte bytes of space on a page. +** Allocate nByte bytes of space from within the B-Tree page passed +** as the first argument. Write into *pIdx the index into pPage->aData[] +** of the first byte of allocated space. Return either SQLITE_OK or +** an error code (usually SQLITE_CORRUPT). ** -** Return the index into pPage->aData[] of the first byte of -** the new allocation. The caller guarantees that there is enough -** space. This routine will never fail. -** -** If the page contains nBytes of free space but does not contain -** nBytes of contiguous free space, then this routine automatically -** calls defragementPage() to consolidate all free space before -** allocating the new chunk. +** The caller guarantees that there is sufficient space to make the +** allocation. This routine might need to defragment in order to bring +** all the space together, however. This routine will avoid using +** the first two bytes past the cell pointer area since presumably this +** allocation is being made in order to insert a new cell, so we will +** also end up needing a new cell pointer. */ -static int allocateSpace(MemPage *pPage, int nByte){ - int addr, pc, hdr; - int size; - int nFrag; - int top; - int nCell; - int cellOffset; - unsigned char *data; +static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ + const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ + u8 * const data = pPage->aData; /* Local cache of pPage->aData */ + int nFrag; /* Number of fragmented bytes on pPage */ + int top; /* First byte of cell content area */ + int gap; /* First byte of gap between cell pointers and cell content */ + int rc; /* Integer return code */ - data = pPage->aData; assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( pPage->pBt ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( nByte>=0 ); /* Minimum cell size is 4 */ assert( pPage->nFree>=nByte ); assert( pPage->nOverflow==0 ); - pPage->nFree -= nByte; - hdr = pPage->hdrOffset; + assert( nByte<pPage->pBt->usableSize-8 ); nFrag = data[hdr+7]; - if( nFrag<60 ){ - /* Search the freelist looking for a slot big enough to satisfy the - ** space request. */ - addr = hdr+1; - while( (pc = get2byte(&data[addr]))>0 ){ - size = get2byte(&data[pc+2]); + assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); + gap = pPage->cellOffset + 2*pPage->nCell; + top = get2byte(&data[hdr+5]); + if( gap>top ) return SQLITE_CORRUPT_BKPT; + testcase( gap+2==top ); + testcase( gap+1==top ); + testcase( gap==top ); + + if( nFrag>=60 ){ + /* Always defragment highly fragmented pages */ + rc = defragmentPage(pPage); + if( rc ) return rc; + top = get2byte(&data[hdr+5]); + }else if( gap+2<=top ){ + /* Search the freelist looking for a free slot big enough to satisfy + ** the request. The allocation is made from the first free slot in + ** the list that is large enough to accomadate it. + */ + int pc, addr; + for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){ + int size = get2byte(&data[pc+2]); /* Size of free slot */ if( size>=nByte ){ - if( size<nByte+4 ){ + int x = size - nByte; + testcase( x==4 ); + testcase( x==3 ); + if( x<4 ){ + /* Remove the slot from the free-list. Update the number of + ** fragmented bytes within the page. */ memcpy(&data[addr], &data[pc], 2); - data[hdr+7] = nFrag + size - nByte; - return pc; + data[hdr+7] = (u8)(nFrag + x); }else{ - put2byte(&data[pc+2], size-nByte); - return pc + size - nByte; + /* The slot remains on the free-list. Reduce its size to account + ** for the portion used by the new allocation. */ + put2byte(&data[pc+2], x); } + *pIdx = pc + x; + return SQLITE_OK; } - addr = pc; } } - /* Allocate memory from the gap in between the cell pointer array - ** and the cell content area. + /* Check to make sure there is enough space in the gap to satisfy + ** the allocation. If not, defragment. */ - top = get2byte(&data[hdr+5]); - nCell = get2byte(&data[hdr+3]); - cellOffset = pPage->cellOffset; - if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){ - defragmentPage(pPage); + testcase( gap+2+nByte==top ); + if( gap+2+nByte>top ){ + rc = defragmentPage(pPage); + if( rc ) return rc; top = get2byte(&data[hdr+5]); + assert( gap+nByte<=top ); } + + + /* Allocate memory from the gap in between the cell pointer array + ** and the cell content area. The btreeInitPage() call has already + ** validated the freelist. Given that the freelist is valid, there + ** is no way that the allocation can extend off the end of the page. + ** The assert() below verifies the previous sentence. + */ top -= nByte; - assert( cellOffset + 2*nCell <= top ); put2byte(&data[hdr+5], top); - return top; + assert( top+nByte <= pPage->pBt->usableSize ); + *pIdx = top; + return SQLITE_OK; } /* @@ -816,13 +1212,14 @@ static int allocateSpace(MemPage *pPage, int nByte){ ** Most of the effort here is involved in coalesing adjacent ** free blocks into a single big free block. */ -static void freeSpace(MemPage *pPage, int start, int size){ +static int freeSpace(MemPage *pPage, int start, int size){ int addr, pbegin, hdr; + int iLast; /* Largest possible freeblock offset */ unsigned char *data = pPage->aData; assert( pPage->pBt!=0 ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); - assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) ); + assert( start>=pPage->hdrOffset+6+pPage->childPtrSize ); assert( (start + size)<=pPage->pBt->usableSize ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( size>=0 ); /* Minimum cell size is 4 */ @@ -833,35 +1230,52 @@ static void freeSpace(MemPage *pPage, int start, int size){ memset(&data[start], 0, size); #endif - /* Add the space back into the linked list of freeblocks */ + /* Add the space back into the linked list of freeblocks. Note that + ** even though the freeblock list was checked by btreeInitPage(), + ** btreeInitPage() did not detect overlapping cells or + ** freeblocks that overlapped cells. Nor does it detect when the + ** cell content area exceeds the value in the page header. If these + ** situations arise, then subsequent insert operations might corrupt + ** the freelist. So we do need to check for corruption while scanning + ** the freelist. + */ hdr = pPage->hdrOffset; addr = hdr + 1; + iLast = pPage->pBt->usableSize - 4; + assert( start<=iLast ); while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ - assert( pbegin<=pPage->pBt->usableSize-4 ); - assert( pbegin>addr ); + if( pbegin<addr+4 ){ + return SQLITE_CORRUPT_BKPT; + } addr = pbegin; } - assert( pbegin<=pPage->pBt->usableSize-4 ); + if( pbegin>iLast ){ + return SQLITE_CORRUPT_BKPT; + } assert( pbegin>addr || pbegin==0 ); put2byte(&data[addr], start); put2byte(&data[start], pbegin); put2byte(&data[start+2], size); - pPage->nFree += size; + pPage->nFree = pPage->nFree + (u16)size; /* Coalesce adjacent free blocks */ - addr = pPage->hdrOffset + 1; + addr = hdr + 1; while( (pbegin = get2byte(&data[addr]))>0 ){ - int pnext, psize; + int pnext, psize, x; assert( pbegin>addr ); assert( pbegin<=pPage->pBt->usableSize-4 ); pnext = get2byte(&data[pbegin]); psize = get2byte(&data[pbegin+2]); if( pbegin + psize + 3 >= pnext && pnext>0 ){ int frag = pnext - (pbegin+psize); - assert( frag<=data[pPage->hdrOffset+7] ); - data[pPage->hdrOffset+7] -= frag; - put2byte(&data[pbegin], get2byte(&data[pnext])); - put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin); + if( (frag<0) || (frag>(int)data[hdr+7]) ){ + return SQLITE_CORRUPT_BKPT; + } + data[hdr+7] -= (u8)frag; + x = get2byte(&data[pnext]); + put2byte(&data[pbegin], x); + x = pnext + get2byte(&data[pnext+2]) - pbegin; + put2byte(&data[pbegin+2], x); }else{ addr = pbegin; } @@ -872,9 +1286,11 @@ static void freeSpace(MemPage *pPage, int start, int size){ int top; pbegin = get2byte(&data[hdr+1]); memcpy(&data[hdr+1], &data[pbegin], 2); - top = get2byte(&data[hdr+5]); - put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2])); + top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]); + put2byte(&data[hdr+5], top); } + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); + return SQLITE_OK; } /* @@ -894,7 +1310,7 @@ static int decodeFlags(MemPage *pPage, int flagByte){ assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - pPage->leaf = flagByte>>3; assert( PTF_LEAF == 1<<3 ); + pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 ); flagByte &= ~PTF_LEAF; pPage->childPtrSize = 4-4*pPage->leaf; pBt = pPage->pBt; @@ -917,110 +1333,115 @@ static int decodeFlags(MemPage *pPage, int flagByte){ /* ** Initialize the auxiliary information for a disk block. ** -** The pParent parameter must be a pointer to the MemPage which -** is the parent of the page being initialized. The root of a -** BTree has no parent and so for that page, pParent==NULL. -** ** Return SQLITE_OK on success. If we see that the page does ** not contain a well-formed database page, then return ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not ** guarantee that the page is well-formed. It only shows that ** we failed to detect any corruption. */ -int sqlite3BtreeInitPage( - MemPage *pPage, /* The page to be initialized */ - MemPage *pParent /* The parent. Might be NULL */ -){ - int pc; /* Address of a freeblock within pPage->aData[] */ - int hdr; /* Offset to beginning of page header */ - u8 *data; /* Equal to pPage->aData */ - BtShared *pBt; /* The main btree structure */ - int usableSize; /* Amount of usable space on each page */ - int cellOffset; /* Offset from start of page to first cell pointer */ - int nFree; /* Number of unused bytes on the page */ - int top; /* First byte of the cell content area */ +static int btreeInitPage(MemPage *pPage){ - pBt = pPage->pBt; - assert( pBt!=0 ); - assert( pParent==0 || pParent->pBt==pBt ); - assert( sqlite3_mutex_held(pBt->mutex) ); + assert( pPage->pBt!=0 ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); - if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){ - /* The parent page should never change unless the file is corrupt */ - return SQLITE_CORRUPT_BKPT; - } - if( pPage->isInit ) return SQLITE_OK; - if( pPage->pParent==0 && pParent!=0 ){ - pPage->pParent = pParent; - sqlite3PagerRef(pParent->pDbPage); - } - hdr = pPage->hdrOffset; - data = pPage->aData; - if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; - assert( pBt->pageSize>=512 && pBt->pageSize<=32768 ); - pPage->maskPage = pBt->pageSize - 1; - pPage->nOverflow = 0; - pPage->idxShift = 0; - usableSize = pBt->usableSize; - pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; - top = get2byte(&data[hdr+5]); - pPage->nCell = get2byte(&data[hdr+3]); - if( pPage->nCell>MX_CELL(pBt) ){ - /* To many cells for a single page. The page must be corrupt */ - return SQLITE_CORRUPT_BKPT; - } - if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){ - /* All pages must have at least one cell, except for root pages */ - return SQLITE_CORRUPT_BKPT; - } - /* Compute the total free space on the page */ - pc = get2byte(&data[hdr+1]); - nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell); - while( pc>0 ){ - int next, size; - if( pc>usableSize-4 ){ - /* Free block is off the page */ - return SQLITE_CORRUPT_BKPT; - } - next = get2byte(&data[pc]); - size = get2byte(&data[pc+2]); - if( next>0 && next<=pc+size+3 ){ - /* Free blocks must be in accending order */ - return SQLITE_CORRUPT_BKPT; - } - nFree += size; - pc = next; - } - pPage->nFree = nFree; - if( nFree>=usableSize ){ - /* Free space cannot exceed total page size */ - return SQLITE_CORRUPT_BKPT; - } + if( !pPage->isInit ){ + u16 pc; /* Address of a freeblock within pPage->aData[] */ + u8 hdr; /* Offset to beginning of page header */ + u8 *data; /* Equal to pPage->aData */ + BtShared *pBt; /* The main btree structure */ + u16 usableSize; /* Amount of usable space on each page */ + u16 cellOffset; /* Offset from start of page to first cell pointer */ + u16 nFree; /* Number of unused bytes on the page */ + u16 top; /* First byte of the cell content area */ + int iCellFirst; /* First allowable cell or freeblock offset */ + int iCellLast; /* Last possible cell or freeblock offset */ -#if 0 - /* Check that all the offsets in the cell offset array are within range. - ** - ** Omitting this consistency check and using the pPage->maskPage mask - ** to prevent overrunning the page buffer in findCell() results in a - ** 2.5% performance gain. - */ - { - u8 *pOff; /* Iterator used to check all cell offsets are in range */ - u8 *pEnd; /* Pointer to end of cell offset array */ - u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */ - mask = ~(((u8)(pBt->pageSize>>8))-1); - pEnd = &data[cellOffset + pPage->nCell*2]; - for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2); - if( pOff!=pEnd ){ + pBt = pPage->pBt; + + hdr = pPage->hdrOffset; + data = pPage->aData; + if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; + assert( pBt->pageSize>=512 && pBt->pageSize<=32768 ); + pPage->maskPage = pBt->pageSize - 1; + pPage->nOverflow = 0; + usableSize = pBt->usableSize; + pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; + top = get2byte(&data[hdr+5]); + pPage->nCell = get2byte(&data[hdr+3]); + if( pPage->nCell>MX_CELL(pBt) ){ + /* To many cells for a single page. The page must be corrupt */ return SQLITE_CORRUPT_BKPT; } - } + testcase( pPage->nCell==MX_CELL(pBt) ); + + /* A malformed database page might cause us to read past the end + ** of page when parsing a cell. + ** + ** The following block of code checks early to see if a cell extends + ** past the end of a page boundary and causes SQLITE_CORRUPT to be + ** returned if it does. + */ + iCellFirst = cellOffset + 2*pPage->nCell; + iCellLast = usableSize - 4; +#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) + { + int i; /* Index into the cell pointer array */ + int sz; /* Size of a cell */ + + if( !pPage->leaf ) iCellLast--; + for(i=0; i<pPage->nCell; i++){ + pc = get2byte(&data[cellOffset+i*2]); + testcase( pc==iCellFirst ); + testcase( pc==iCellLast ); + if( pc<iCellFirst || pc>iCellLast ){ + return SQLITE_CORRUPT_BKPT; + } + sz = cellSizePtr(pPage, &data[pc]); + testcase( pc+sz==usableSize ); + if( pc+sz>usableSize ){ + return SQLITE_CORRUPT_BKPT; + } + } + if( !pPage->leaf ) iCellLast++; + } #endif - pPage->isInit = 1; + /* Compute the total free space on the page */ + pc = get2byte(&data[hdr+1]); + nFree = data[hdr+7] + top; + while( pc>0 ){ + u16 next, size; + if( pc<iCellFirst || pc>iCellLast ){ + /* Start of free block is off the page */ + return SQLITE_CORRUPT_BKPT; + } + next = get2byte(&data[pc]); + size = get2byte(&data[pc+2]); + if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){ + /* Free blocks must be in ascending order. And the last byte of + ** the free-block must lie on the database page. */ + return SQLITE_CORRUPT_BKPT; + } + nFree = nFree + size; + pc = next; + } + + /* At this point, nFree contains the sum of the offset to the start + ** of the cell-content area plus the number of free bytes within + ** the cell-content area. If this is greater than the usable-size + ** of the page, then the page must be corrupted. This check also + ** serves to verify that the offset to the start of the cell-content + ** area, according to the page header, lies within the page. + */ + if( nFree>usableSize ){ + return SQLITE_CORRUPT_BKPT; + } + pPage->nFree = (u16)(nFree - iCellFirst); + pPage->isInit = 1; + } return SQLITE_OK; } @@ -1031,8 +1452,8 @@ int sqlite3BtreeInitPage( static void zeroPage(MemPage *pPage, int flags){ unsigned char *data = pPage->aData; BtShared *pBt = pPage->pBt; - int hdr = pPage->hdrOffset; - int first; + u8 hdr = pPage->hdrOffset; + u16 first; assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); @@ -1040,8 +1461,8 @@ static void zeroPage(MemPage *pPage, int flags){ assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( sqlite3_mutex_held(pBt->mutex) ); /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/ - data[hdr] = flags; - first = hdr + 8 + 4*((flags&PTF_LEAF)==0); + data[hdr] = (char)flags; + first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0); memset(&data[hdr+1], 0, 4); data[hdr+7] = 0; put2byte(&data[hdr+5], pBt->usableSize); @@ -1052,11 +1473,25 @@ static void zeroPage(MemPage *pPage, int flags){ pPage->nOverflow = 0; assert( pBt->pageSize>=512 && pBt->pageSize<=32768 ); pPage->maskPage = pBt->pageSize - 1; - pPage->idxShift = 0; pPage->nCell = 0; pPage->isInit = 1; } + +/* +** Convert a DbPage obtained from the pager into a MemPage used by +** the btree layer. +*/ +static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ + MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); + pPage->aData = sqlite3PagerGetData(pDbPage); + pPage->pDbPage = pDbPage; + pPage->pBt = pBt; + pPage->pgno = pgno; + pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; + return pPage; +} + /* ** Get a page from the pager. Initialize the MemPage.pBt and ** MemPage.aData elements if needed. @@ -1068,62 +1503,93 @@ static void zeroPage(MemPage *pPage, int flags){ ** means we have started to be concerned about content and the disk ** read should occur at that point. */ -int sqlite3BtreeGetPage( +static int btreeGetPage( BtShared *pBt, /* The btree */ Pgno pgno, /* Number of the page to fetch */ MemPage **ppPage, /* Return the page in this parameter */ int noContent /* Do not load page content if true */ ){ int rc; - MemPage *pPage; DbPage *pDbPage; assert( sqlite3_mutex_held(pBt->mutex) ); rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); if( rc ) return rc; - pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage); - pPage->aData = sqlite3PagerGetData(pDbPage); - pPage->pDbPage = pDbPage; - pPage->pBt = pBt; - pPage->pgno = pgno; - pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; - *ppPage = pPage; + *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); return SQLITE_OK; } /* -** Get a page from the pager and initialize it. This routine -** is just a convenience wrapper around separate calls to -** sqlite3BtreeGetPage() and sqlite3BtreeInitPage(). +** Retrieve a page from the pager cache. If the requested page is not +** already in the pager cache return NULL. Initialize the MemPage.pBt and +** MemPage.aData elements if needed. +*/ +static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ + DbPage *pDbPage; + assert( sqlite3_mutex_held(pBt->mutex) ); + pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); + if( pDbPage ){ + return btreePageFromDbPage(pDbPage, pgno, pBt); + } + return 0; +} + +/* +** Return the size of the database file in pages. If there is any kind of +** error, return ((unsigned int)-1). +*/ +static Pgno pagerPagecount(BtShared *pBt){ + int nPage = -1; + int rc; + assert( pBt->pPage1 ); + rc = sqlite3PagerPagecount(pBt->pPager, &nPage); + assert( rc==SQLITE_OK || nPage==-1 ); + return (Pgno)nPage; +} + +/* +** Get a page from the pager and initialize it. This routine is just a +** convenience wrapper around separate calls to btreeGetPage() and +** btreeInitPage(). +** +** If an error occurs, then the value *ppPage is set to is undefined. It +** may remain unchanged, or it may be set to an invalid value. */ static int getAndInitPage( BtShared *pBt, /* The database file */ Pgno pgno, /* Number of the page to get */ - MemPage **ppPage, /* Write the page pointer here */ - MemPage *pParent /* Parent of the page */ + MemPage **ppPage /* Write the page pointer here */ ){ int rc; + TESTONLY( Pgno iLastPg = pagerPagecount(pBt); ) assert( sqlite3_mutex_held(pBt->mutex) ); - if( pgno==0 ){ - return SQLITE_CORRUPT_BKPT; - } - rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0); - if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){ - rc = sqlite3BtreeInitPage(*ppPage, pParent); + + rc = btreeGetPage(pBt, pgno, ppPage, 0); + if( rc==SQLITE_OK ){ + rc = btreeInitPage(*ppPage); if( rc!=SQLITE_OK ){ releasePage(*ppPage); - *ppPage = 0; } } + + /* If the requested page number was either 0 or greater than the page + ** number of the last page in the database, this function should return + ** SQLITE_CORRUPT or some other error (i.e. SQLITE_FULL). Check that this + ** is the case. */ + assert( (pgno>0 && pgno<=iLastPg) || rc!=SQLITE_OK ); + testcase( pgno==0 ); + testcase( pgno==iLastPg ); + return rc; } /* ** Release a MemPage. This should be called once for each prior -** call to sqlite3BtreeGetPage. +** call to btreeGetPage. */ static void releasePage(MemPage *pPage){ if( pPage ){ + assert( pPage->nOverflow==0 || sqlite3PagerPageRefcount(pPage->pDbPage)>1 ); assert( pPage->aData ); assert( pPage->pBt ); assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); @@ -1134,25 +1600,6 @@ static void releasePage(MemPage *pPage){ } /* -** This routine is called when the reference count for a page -** reaches zero. We need to unref the pParent pointer when that -** happens. -*/ -static void pageDestructor(DbPage *pData, int pageSize){ - MemPage *pPage; - assert( (pageSize & 7)==0 ); - pPage = (MemPage *)sqlite3PagerGetExtra(pData); - assert( pPage->isInit==0 || sqlite3_mutex_held(pPage->pBt->mutex) ); - if( pPage->pParent ){ - MemPage *pParent = pPage->pParent; - assert( pParent->pBt==pPage->pBt ); - pPage->pParent = 0; - releasePage(pParent); - } - pPage->isInit = 0; -} - -/* ** During a rollback, when the pager reloads information into the cache ** so that the cache is restored to its original state at the start of ** the transaction, for each page restored this routine is called. @@ -1160,21 +1607,29 @@ static void pageDestructor(DbPage *pData, int pageSize){ ** This routine needs to reset the extra data section at the end of the ** page to agree with the restored data. */ -static void pageReinit(DbPage *pData, int pageSize){ +static void pageReinit(DbPage *pData){ MemPage *pPage; - assert( (pageSize & 7)==0 ); pPage = (MemPage *)sqlite3PagerGetExtra(pData); + assert( sqlite3PagerPageRefcount(pData)>0 ); if( pPage->isInit ){ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); pPage->isInit = 0; - sqlite3BtreeInitPage(pPage, pPage->pParent); + if( sqlite3PagerPageRefcount(pData)>1 ){ + /* pPage might not be a btree page; it might be an overflow page + ** or ptrmap page or a free page. In those cases, the following + ** call to btreeInitPage() will likely return SQLITE_CORRUPT. + ** But no harm is done by this. And it is very important that + ** btreeInitPage() be called on every btree page so we make + ** the call for every page that comes in for re-initing. */ + btreeInitPage(pPage); + } } } /* ** Invoke the busy handler for a btree. */ -static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){ +static int btreeInvokeBusyHandler(void *pArg){ BtShared *pBt = (BtShared*)pArg; assert( pBt->db ); assert( sqlite3_mutex_held(pBt->db->mutex) ); @@ -1189,6 +1644,12 @@ static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){ ** database file will be deleted when sqlite3BtreeClose() is called. ** If zFilename is ":memory:" then an in-memory database is created ** that is automatically destroyed when it is closed. +** +** If the database is already opened in the same database connection +** and we are in shared cache mode, then the open will fail with an +** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared +** objects in the same database connection since doing so will lead +** to problems with locking. */ int sqlite3BtreeOpen( const char *zFilename, /* Name of the file containing the BTree database */ @@ -1197,12 +1658,13 @@ int sqlite3BtreeOpen( int flags, /* Options */ int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ ){ - sqlite3_vfs *pVfs; /* The VFS to use for this btree */ - BtShared *pBt = 0; /* Shared part of btree structure */ - Btree *p; /* Handle to return */ - int rc = SQLITE_OK; - int nReserve; - unsigned char zDbHeader[100]; + sqlite3_vfs *pVfs; /* The VFS to use for this btree */ + BtShared *pBt = 0; /* Shared part of btree structure */ + Btree *p; /* Handle to return */ + sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */ + int rc = SQLITE_OK; /* Result code from this function */ + u8 nReserve; /* Byte of unused space on each page */ + unsigned char zDbHeader[100]; /* Database header content */ /* Set the variable isMemdb to true for an in-memory database, or ** false for a file-based database. This symbol is only required if @@ -1227,33 +1689,46 @@ int sqlite3BtreeOpen( } p->inTrans = TRANS_NONE; p->db = db; +#ifndef SQLITE_OMIT_SHARED_CACHE + p->lock.pBtree = p; + p->lock.iTable = 1; +#endif #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) /* ** If this Btree is a candidate for shared cache, try to find an ** existing BtShared object that we can share with */ - if( isMemdb==0 - && (db->flags & SQLITE_Vtab)==0 - && zFilename && zFilename[0] - ){ - if( sqlite3SharedCacheEnabled ){ + if( isMemdb==0 && zFilename && zFilename[0] ){ + if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ int nFullPathname = pVfs->mxPathname+1; char *zFullPathname = sqlite3Malloc(nFullPathname); sqlite3_mutex *mutexShared; p->sharable = 1; - db->flags |= SQLITE_SharedCache; if( !zFullPathname ){ sqlite3_free(p); return SQLITE_NOMEM; } sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); + mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); + sqlite3_mutex_enter(mutexOpen); mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); sqlite3_mutex_enter(mutexShared); - for(pBt=sqlite3SharedCacheList; pBt; pBt=pBt->pNext){ + for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ assert( pBt->nRef>0 ); if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) && sqlite3PagerVfs(pBt->pPager)==pVfs ){ + int iDb; + for(iDb=db->nDb-1; iDb>=0; iDb--){ + Btree *pExisting = db->aDb[iDb].pBt; + if( pExisting && pExisting->pBt==pBt ){ + sqlite3_mutex_leave(mutexShared); + sqlite3_mutex_leave(mutexOpen); + sqlite3_free(zFullPathname); + sqlite3_free(p); + return SQLITE_CONSTRAINT; + } + } p->pBt = pBt; pBt->nRef++; break; @@ -1291,21 +1766,18 @@ int sqlite3BtreeOpen( rc = SQLITE_NOMEM; goto btree_open_out; } - pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler; - pBt->busyHdr.pArg = pBt; rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, - EXTRA_SIZE, flags, vfsFlags); + EXTRA_SIZE, flags, vfsFlags, pageReinit); if( rc==SQLITE_OK ){ rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); } if( rc!=SQLITE_OK ){ goto btree_open_out; } - sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr); + pBt->db = db; + sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt); p->pBt = pBt; - sqlite3PagerSetDestructor(pBt->pPager, pageDestructor); - sqlite3PagerSetReiniter(pBt->pPager, pageReinit); pBt->pCursor = 0; pBt->pPage1 = 0; pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager); @@ -1313,7 +1785,6 @@ int sqlite3BtreeOpen( if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ pBt->pageSize = 0; - sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); #ifndef SQLITE_OMIT_AUTOVACUUM /* If the magic name ":memory:" will create an in-memory database, then ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if @@ -1335,9 +1806,10 @@ int sqlite3BtreeOpen( pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); #endif } + rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); + if( rc ) goto btree_open_out; pBt->usableSize = pBt->pageSize - nReserve; assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ - sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) /* Add the new BtShared object to the linked list sharable BtShareds. @@ -1346,7 +1818,7 @@ int sqlite3BtreeOpen( sqlite3_mutex *mutexShared; pBt->nRef = 1; mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); - if( SQLITE_THREADSAFE && sqlite3Config.bCoreMutex ){ + if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); if( pBt->mutex==0 ){ rc = SQLITE_NOMEM; @@ -1355,8 +1827,8 @@ int sqlite3BtreeOpen( } } sqlite3_mutex_enter(mutexShared); - pBt->pNext = sqlite3SharedCacheList; - sqlite3SharedCacheList = pBt; + pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); + GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; sqlite3_mutex_leave(mutexShared); } #endif @@ -1404,6 +1876,10 @@ btree_open_out: sqlite3_free(p); *ppBtree = 0; } + if( mutexOpen ){ + assert( sqlite3_mutex_held(mutexOpen) ); + sqlite3_mutex_leave(mutexOpen); + } return rc; } @@ -1424,10 +1900,10 @@ static int removeFromSharingList(BtShared *pBt){ sqlite3_mutex_enter(pMaster); pBt->nRef--; if( pBt->nRef<=0 ){ - if( sqlite3SharedCacheList==pBt ){ - sqlite3SharedCacheList = pBt->pNext; + if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ + GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; }else{ - pList = sqlite3SharedCacheList; + pList = GLOBAL(BtShared*,sqlite3SharedCacheList); while( ALWAYS(pList) && pList->pNext!=pBt ){ pList=pList->pNext; } @@ -1475,7 +1951,6 @@ int sqlite3BtreeClose(Btree *p){ /* Close all cursors opened via this handle. */ assert( sqlite3_mutex_held(p->db->mutex) ); sqlite3BtreeEnter(p); - pBt->db = p->db; pCur = pBt->pCursor; while( pCur ){ BtCursor *pTmp = pCur; @@ -1585,6 +2060,8 @@ int sqlite3BtreeSyncDisabled(Btree *p){ #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) /* ** Change the default pages size and the number of reserved bytes per page. +** Or, if the page size has already been fixed, return SQLITE_READONLY +** without changing anything. ** ** The page size must be a power of 2 between 512 and 65536. If the page ** size supplied does not meet this constraint then the page size is not @@ -1597,10 +2074,14 @@ int sqlite3BtreeSyncDisabled(Btree *p){ ** ** If parameter nReserve is less than zero, then the number of reserved ** bytes per page is left unchanged. +** +** If the iFix!=0 then the pageSizeFixed flag is set so that the page size +** and autovacuum mode can no longer be changed. */ -int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){ +int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ int rc = SQLITE_OK; BtShared *pBt = p->pBt; + assert( nReserve>=-1 && nReserve<=255 ); sqlite3BtreeEnter(p); if( pBt->pageSizeFixed ){ sqlite3BtreeLeave(p); @@ -1609,15 +2090,17 @@ int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){ if( nReserve<0 ){ nReserve = pBt->pageSize - pBt->usableSize; } + assert( nReserve>=0 && nReserve<=255 ); if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && ((pageSize-1)&pageSize)==0 ){ assert( (pageSize & 7)==0 ); assert( !pBt->pPage1 && !pBt->pCursor ); - pBt->pageSize = pageSize; + pBt->pageSize = (u16)pageSize; freeTempSpace(pBt); - rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); } - pBt->usableSize = pBt->pageSize - nReserve; + rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); + pBt->usableSize = pBt->pageSize - (u16)nReserve; + if( iFix ) pBt->pageSizeFixed = 1; sqlite3BtreeLeave(p); return rc; } @@ -1628,6 +2111,12 @@ int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){ int sqlite3BtreeGetPageSize(Btree *p){ return p->pBt->pageSize; } + +/* +** Return the number of bytes of space at the end of every page that +** are intentually left unused. This is the "reserved" space that is +** sometimes used by extensions. +*/ int sqlite3BtreeGetReserve(Btree *p){ int n; sqlite3BtreeEnter(p); @@ -1662,13 +2151,14 @@ int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ #else BtShared *pBt = p->pBt; int rc = SQLITE_OK; - int av = (autoVacuum?1:0); + u8 av = (u8)autoVacuum; sqlite3BtreeEnter(p); - if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){ + if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){ rc = SQLITE_READONLY; }else{ - pBt->autoVacuum = av; + pBt->autoVacuum = av ?1:0; + pBt->incrVacuum = av==2 ?1:0; } sqlite3BtreeLeave(p); return rc; @@ -1711,8 +2201,10 @@ static int lockBtree(BtShared *pBt){ int nPage; assert( sqlite3_mutex_held(pBt->mutex) ); - if( pBt->pPage1 ) return SQLITE_OK; - rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0); + assert( pBt->pPage1==0 ); + rc = sqlite3PagerSharedLock(pBt->pPager); + if( rc!=SQLITE_OK ) return rc; + rc = btreeGetPage(pBt, 1, &pPage1, 0); if( rc!=SQLITE_OK ) return rc; /* Do some checking to help insure the file we opened really is @@ -1760,17 +2252,18 @@ static int lockBtree(BtShared *pBt){ ** again with the correct page-size. */ releasePage(pPage1); - pBt->usableSize = usableSize; - pBt->pageSize = pageSize; + pBt->usableSize = (u16)usableSize; + pBt->pageSize = (u16)pageSize; freeTempSpace(pBt); - sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize); - return SQLITE_OK; + rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, + pageSize-usableSize); + return rc; } - if( usableSize<500 ){ + if( usableSize<480 ){ goto page1_init_failed; } - pBt->pageSize = pageSize; - pBt->usableSize = usableSize; + pBt->pageSize = (u16)pageSize; + pBt->usableSize = (u16)usableSize; #ifndef SQLITE_OMIT_AUTOVACUUM pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); @@ -1805,61 +2298,29 @@ page1_init_failed: } /* -** This routine works like lockBtree() except that it also invokes the -** busy callback if there is lock contention. -*/ -static int lockBtreeWithRetry(Btree *pRef){ - int rc = SQLITE_OK; - - assert( sqlite3BtreeHoldsMutex(pRef) ); - if( pRef->inTrans==TRANS_NONE ){ - u8 inTransaction = pRef->pBt->inTransaction; - btreeIntegrity(pRef); - rc = sqlite3BtreeBeginTrans(pRef, 0); - pRef->pBt->inTransaction = inTransaction; - pRef->inTrans = TRANS_NONE; - if( rc==SQLITE_OK ){ - pRef->pBt->nTransaction--; - } - btreeIntegrity(pRef); - } - return rc; -} - - -/* ** If there are no outstanding cursors and we are not in the middle ** of a transaction but there is a read lock on the database, then ** this routine unrefs the first page of the database file which ** has the effect of releasing the read lock. ** -** If there are any outstanding cursors, this routine is a no-op. -** ** If there is a transaction in progress, this routine is a no-op. */ static void unlockBtreeIfUnused(BtShared *pBt){ assert( sqlite3_mutex_held(pBt->mutex) ); - if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){ - if( sqlite3PagerRefcount(pBt->pPager)>=1 ){ - assert( pBt->pPage1->aData ); -#if 0 - if( pBt->pPage1->aData==0 ){ - MemPage *pPage = pBt->pPage1; - pPage->aData = sqlite3PagerGetData(pPage->pDbPage); - pPage->pBt = pBt; - pPage->pgno = 1; - } -#endif - releasePage(pBt->pPage1); - } + assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE ); + if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ + assert( pBt->pPage1->aData ); + assert( sqlite3PagerRefcount(pBt->pPager)==1 ); + assert( pBt->pPage1->aData ); + releasePage(pBt->pPage1); pBt->pPage1 = 0; - pBt->inStmt = 0; } } /* -** Create a new database by initializing the first page of the -** file. +** If pBt points to an empty file then convert that empty file +** into a new empty database by initializing the first page of +** the database. */ static int newDatabase(BtShared *pBt){ MemPage *pP1; @@ -1868,8 +2329,11 @@ static int newDatabase(BtShared *pBt){ int nPage; assert( sqlite3_mutex_held(pBt->mutex) ); + /* The database size has already been measured and cached, so failure + ** is impossible here. If the original size measurement failed, then + ** processing aborts before entering this routine. */ rc = sqlite3PagerPagecount(pBt->pPager, &nPage); - if( rc!=SQLITE_OK || nPage>0 ){ + if( NEVER(rc!=SQLITE_OK) || nPage>0 ){ return rc; } pP1 = pBt->pPage1; @@ -1883,7 +2347,8 @@ static int newDatabase(BtShared *pBt){ put2byte(&data[16], pBt->pageSize); data[18] = 1; data[19] = 1; - data[20] = pBt->pageSize - pBt->usableSize; + assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); + data[20] = (u8)(pBt->pageSize - pBt->usableSize); data[21] = 64; data[22] = 32; data[23] = 32; @@ -1935,11 +2400,11 @@ static int newDatabase(BtShared *pBt){ ** proceed. */ int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ + sqlite3 *pBlock = 0; BtShared *pBt = p->pBt; int rc = SQLITE_OK; sqlite3BtreeEnter(p); - pBt->db = p->db; btreeIntegrity(p); /* If the btree is already in a write-transaction, or it @@ -1956,88 +2421,102 @@ int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ goto trans_begun; } +#ifndef SQLITE_OMIT_SHARED_CACHE /* If another database handle has already opened a write transaction ** on this shared-btree structure and a second write transaction is - ** requested, return SQLITE_BUSY. + ** requested, return SQLITE_LOCKED. */ - if( pBt->inTransaction==TRANS_WRITE && wrflag ){ - rc = SQLITE_BUSY; - goto trans_begun; - } - -#ifndef SQLITE_OMIT_SHARED_CACHE - if( wrflag>1 ){ + if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){ + pBlock = pBt->pWriter->db; + }else if( wrflag>1 ){ BtLock *pIter; for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ if( pIter->pBtree!=p ){ - rc = SQLITE_BUSY; - goto trans_begun; + pBlock = pIter->pBtree->db; + break; } } } + if( pBlock ){ + sqlite3ConnectionBlocked(p->db, pBlock); + rc = SQLITE_LOCKED_SHAREDCACHE; + goto trans_begun; + } #endif + /* Any read-only or read-write transaction implies a read-lock on + ** page 1. So if some other shared-cache client already has a write-lock + ** on page 1, the transaction cannot be opened. */ + rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); + if( SQLITE_OK!=rc ) goto trans_begun; + do { - if( pBt->pPage1==0 ){ - do{ - rc = lockBtree(pBt); - }while( pBt->pPage1==0 && rc==SQLITE_OK ); - } + /* Call lockBtree() until either pBt->pPage1 is populated or + ** lockBtree() returns something other than SQLITE_OK. lockBtree() + ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after + ** reading page 1 it discovers that the page-size of the database + ** file is not pBt->pageSize. In this case lockBtree() will update + ** pBt->pageSize to the page-size of the file on disk. + */ + while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); if( rc==SQLITE_OK && wrflag ){ if( pBt->readOnly ){ rc = SQLITE_READONLY; }else{ - rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1); + rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db)); if( rc==SQLITE_OK ){ rc = newDatabase(pBt); } } } - if( rc==SQLITE_OK ){ - if( wrflag ) pBt->inStmt = 0; - }else{ + if( rc!=SQLITE_OK ){ unlockBtreeIfUnused(pBt); } }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && - sqlite3BtreeInvokeBusyHandler(pBt, 0) ); + btreeInvokeBusyHandler(pBt) ); if( rc==SQLITE_OK ){ if( p->inTrans==TRANS_NONE ){ pBt->nTransaction++; +#ifndef SQLITE_OMIT_SHARED_CACHE + if( p->sharable ){ + assert( p->lock.pBtree==p && p->lock.iTable==1 ); + p->lock.eLock = READ_LOCK; + p->lock.pNext = pBt->pLock; + pBt->pLock = &p->lock; + } +#endif } p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); if( p->inTrans>pBt->inTransaction ){ pBt->inTransaction = p->inTrans; } #ifndef SQLITE_OMIT_SHARED_CACHE - if( wrflag>1 ){ - assert( !pBt->pExclusive ); - pBt->pExclusive = p; + if( wrflag ){ + assert( !pBt->pWriter ); + pBt->pWriter = p; + pBt->isExclusive = (u8)(wrflag>1); } #endif } trans_begun: + if( rc==SQLITE_OK && wrflag ){ + /* This call makes sure that the pager has the correct number of + ** open savepoints. If the second parameter is greater than 0 and + ** the sub-journal is not already open, then it will be opened here. + */ + rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); + } + btreeIntegrity(p); sqlite3BtreeLeave(p); return rc; } -/* -** Return the size of the database file in pages. Or return -1 if -** there is any kind of error. -*/ -static int pagerPagecount(Pager *pPager){ - int rc; - int nPage; - rc = sqlite3PagerPagecount(pPager, &nPage); - return (rc==SQLITE_OK?nPage:-1); -} - - #ifndef SQLITE_OMIT_AUTOVACUUM /* @@ -2050,11 +2529,11 @@ static int setChildPtrmaps(MemPage *pPage){ int nCell; /* Number of cells in page pPage */ int rc; /* Return code */ BtShared *pBt = pPage->pBt; - int isInitOrig = pPage->isInit; + u8 isInitOrig = pPage->isInit; Pgno pgno = pPage->pgno; assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - rc = sqlite3BtreeInitPage(pPage, pPage->pParent); + rc = btreeInitPage(pPage); if( rc!=SQLITE_OK ){ goto set_child_ptrmaps_out; } @@ -2063,21 +2542,17 @@ static int setChildPtrmaps(MemPage *pPage){ for(i=0; i<nCell; i++){ u8 *pCell = findCell(pPage, i); - rc = ptrmapPutOvflPtr(pPage, pCell); - if( rc!=SQLITE_OK ){ - goto set_child_ptrmaps_out; - } + ptrmapPutOvflPtr(pPage, pCell, &rc); if( !pPage->leaf ){ Pgno childPgno = get4byte(pCell); - rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); - if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out; + ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); } } if( !pPage->leaf ){ Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); - rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno); + ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); } set_child_ptrmaps_out: @@ -2086,10 +2561,9 @@ set_child_ptrmaps_out: } /* -** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow -** page, is a pointer to page iFrom. Modify this pointer so that it points to -** iTo. Parameter eType describes the type of pointer to be modified, as -** follows: +** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so +** that it points to iTo. Parameter eType describes the type of pointer to +** be modified, as follows: ** ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child ** page of pPage. @@ -2102,6 +2576,7 @@ set_child_ptrmaps_out: */ static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); if( eType==PTRMAP_OVERFLOW2 ){ /* The pointer is always the first 4 bytes of the page in this case. */ if( get4byte(pPage->aData)!=iFrom ){ @@ -2109,18 +2584,18 @@ static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ } put4byte(pPage->aData, iTo); }else{ - int isInitOrig = pPage->isInit; + u8 isInitOrig = pPage->isInit; int i; int nCell; - sqlite3BtreeInitPage(pPage, 0); + btreeInitPage(pPage); nCell = pPage->nCell; for(i=0; i<nCell; i++){ u8 *pCell = findCell(pPage, i); if( eType==PTRMAP_OVERFLOW1 ){ CellInfo info; - sqlite3BtreeParseCellPtr(pPage, pCell, &info); + btreeParseCellPtr(pPage, pCell, &info); if( info.iOverflow ){ if( iFrom==get4byte(&pCell[info.iOverflow]) ){ put4byte(&pCell[info.iOverflow], iTo); @@ -2152,6 +2627,11 @@ static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ /* ** Move the open database page pDbPage to location iFreePage in the ** database. The pDbPage reference remains valid. +** +** The isCommit flag indicates that there is no need to remember that +** the journal needs to be sync()ed before database page pDbPage->pgno +** can be written to. The caller has already promised not to write to that +** page. */ static int relocatePage( BtShared *pBt, /* Btree */ @@ -2159,7 +2639,7 @@ static int relocatePage( u8 eType, /* Pointer map 'type' entry for pDbPage */ Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ Pgno iFreePage, /* The location to move pDbPage to */ - int isCommit + int isCommit /* isCommit flag passed to sqlite3PagerMovepage */ ){ MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ Pgno iDbPage = pDbPage->pgno; @@ -2196,7 +2676,7 @@ static int relocatePage( }else{ Pgno nextOvfl = get4byte(pDbPage->aData); if( nextOvfl!=0 ){ - rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage); + ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); if( rc!=SQLITE_OK ){ return rc; } @@ -2208,7 +2688,7 @@ static int relocatePage( ** iPtrPage. */ if( eType!=PTRMAP_ROOTPAGE ){ - rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0); + rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); if( rc!=SQLITE_OK ){ return rc; } @@ -2220,7 +2700,7 @@ static int relocatePage( rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); releasePage(pPtrPage); if( rc==SQLITE_OK ){ - rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage); + ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); } } return rc; @@ -2238,21 +2718,20 @@ static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); ** database so that the last page of the file currently in use ** is no longer in use. ** -** If the nFin parameter is non-zero, the implementation assumes +** If the nFin parameter is non-zero, this function assumes ** that the caller will keep calling incrVacuumStep() until ** it returns SQLITE_DONE or an error, and that nFin is the ** number of pages the database file will contain after this -** process is complete. +** process is complete. If nFin is zero, it is assumed that +** incrVacuumStep() will be called a finite amount of times +** which may or may not empty the freelist. A full autovacuum +** has nFin>0. A "PRAGMA incremental_vacuum" has nFin==0. */ -static int incrVacuumStep(BtShared *pBt, Pgno nFin){ - Pgno iLastPg; /* Last page in the database */ +static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){ Pgno nFreeList; /* Number of pages still on the free-list */ assert( sqlite3_mutex_held(pBt->mutex) ); - iLastPg = pBt->nTrunc; - if( iLastPg==0 ){ - iLastPg = pagerPagecount(pBt->pPager); - } + assert( iLastPg>nFin ); if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ int rc; @@ -2260,7 +2739,7 @@ static int incrVacuumStep(BtShared *pBt, Pgno nFin){ Pgno iPtrPage; nFreeList = get4byte(&pBt->pPage1->aData[36]); - if( nFreeList==0 || nFin==iLastPg ){ + if( nFreeList==0 ){ return SQLITE_DONE; } @@ -2292,7 +2771,7 @@ static int incrVacuumStep(BtShared *pBt, Pgno nFin){ Pgno iFreePg; /* Index of free page to move pLastPg to */ MemPage *pLastPg; - rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0); + rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); if( rc!=SQLITE_OK ){ return rc; } @@ -2326,9 +2805,24 @@ static int incrVacuumStep(BtShared *pBt, Pgno nFin){ } } - pBt->nTrunc = iLastPg - 1; - while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){ - pBt->nTrunc--; + if( nFin==0 ){ + iLastPg--; + while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){ + if( PTRMAP_ISPAGE(pBt, iLastPg) ){ + MemPage *pPg; + int rc = btreeGetPage(pBt, iLastPg, &pPg, 0); + if( rc!=SQLITE_OK ){ + return rc; + } + rc = sqlite3PagerWrite(pPg->pDbPage); + releasePage(pPg); + if( rc!=SQLITE_OK ){ + return rc; + } + } + iLastPg--; + } + sqlite3PagerTruncateImage(pBt->pPager, iLastPg); } return SQLITE_OK; } @@ -2338,7 +2832,7 @@ static int incrVacuumStep(BtShared *pBt, Pgno nFin){ ** It performs a single unit of work towards an incremental vacuum. ** ** If the incremental vacuum is finished after this function has run, -** SQLITE_DONE is returned. If it is not finished, but no error occured, +** SQLITE_DONE is returned. If it is not finished, but no error occurred, ** SQLITE_OK is returned. Otherwise an SQLite error code. */ int sqlite3BtreeIncrVacuum(Btree *p){ @@ -2346,13 +2840,12 @@ int sqlite3BtreeIncrVacuum(Btree *p){ BtShared *pBt = p->pBt; sqlite3BtreeEnter(p); - pBt->db = p->db; assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); if( !pBt->autoVacuum ){ rc = SQLITE_DONE; }else{ invalidateAllOverflowCache(pBt); - rc = incrVacuumStep(pBt, 0); + rc = incrVacuumStep(pBt, 0, pagerPagecount(pBt)); } sqlite3BtreeLeave(p); return rc; @@ -2367,68 +2860,64 @@ int sqlite3BtreeIncrVacuum(Btree *p){ ** i.e. the database has been reorganized so that only the first *pnTrunc ** pages are in use. */ -static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){ +static int autoVacuumCommit(BtShared *pBt){ int rc = SQLITE_OK; Pager *pPager = pBt->pPager; -#ifndef NDEBUG - int nRef = sqlite3PagerRefcount(pPager); -#endif + VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) ); assert( sqlite3_mutex_held(pBt->mutex) ); invalidateAllOverflowCache(pBt); assert(pBt->autoVacuum); if( !pBt->incrVacuum ){ - Pgno nFin = 0; - - if( pBt->nTrunc==0 ){ - Pgno nFree; - Pgno nPtrmap; - const int pgsz = pBt->pageSize; - int nOrig = pagerPagecount(pBt->pPager); + Pgno nFin; /* Number of pages in database after autovacuuming */ + Pgno nFree; /* Number of pages on the freelist initially */ + Pgno nPtrmap; /* Number of PtrMap pages to be freed */ + Pgno iFree; /* The next page to be freed */ + int nEntry; /* Number of entries on one ptrmap page */ + Pgno nOrig; /* Database size before freeing */ + + nOrig = pagerPagecount(pBt); + if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ + /* It is not possible to create a database for which the final page + ** is either a pointer-map page or the pending-byte page. If one + ** is encountered, this indicates corruption. + */ + return SQLITE_CORRUPT_BKPT; + } - if( PTRMAP_ISPAGE(pBt, nOrig) ){ - return SQLITE_CORRUPT_BKPT; - } - if( nOrig==PENDING_BYTE_PAGE(pBt) ){ - nOrig--; - } - nFree = get4byte(&pBt->pPage1->aData[36]); - nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5); - nFin = nOrig - nFree - nPtrmap; - if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){ - nFin--; - } - while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ - nFin--; - } + nFree = get4byte(&pBt->pPage1->aData[36]); + nEntry = pBt->usableSize/5; + nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; + nFin = nOrig - nFree - nPtrmap; + if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ + nFin--; + } + while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ + nFin--; } + if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; - while( rc==SQLITE_OK ){ - rc = incrVacuumStep(pBt, nFin); + for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ + rc = incrVacuumStep(pBt, nFin, iFree); } - if( rc==SQLITE_DONE ){ - assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc); + if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ rc = SQLITE_OK; - if( pBt->nTrunc && nFin ){ - rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); - put4byte(&pBt->pPage1->aData[32], 0); - put4byte(&pBt->pPage1->aData[36], 0); - pBt->nTrunc = nFin; - } + rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); + put4byte(&pBt->pPage1->aData[32], 0); + put4byte(&pBt->pPage1->aData[36], 0); + sqlite3PagerTruncateImage(pBt->pPager, nFin); } if( rc!=SQLITE_OK ){ sqlite3PagerRollback(pPager); } } - if( rc==SQLITE_OK ){ - *pnTrunc = pBt->nTrunc; - pBt->nTrunc = 0; - } assert( nRef==sqlite3PagerRefcount(pPager) ); return rc; } +#else /* ifndef SQLITE_OMIT_AUTOVACUUM */ +# define setChildPtrmaps(x) SQLITE_OK #endif /* @@ -2441,7 +2930,7 @@ static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){ ** database are written into the database file and flushed to oxide. ** At the end of this call, the rollback journal still exists on the ** disk and we are still holding all locks, so the transaction has not -** committed. See sqlite3BtreeCommit() for the second phase of the +** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the ** commit process. ** ** This call is a no-op if no write-transaction is currently active on pBt. @@ -2461,34 +2950,75 @@ int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ int rc = SQLITE_OK; if( p->inTrans==TRANS_WRITE ){ BtShared *pBt = p->pBt; - Pgno nTrunc = 0; sqlite3BtreeEnter(p); - pBt->db = p->db; #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum ){ - rc = autoVacuumCommit(pBt, &nTrunc); + rc = autoVacuumCommit(pBt); if( rc!=SQLITE_OK ){ sqlite3BtreeLeave(p); return rc; } } #endif - rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0); + rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0); sqlite3BtreeLeave(p); } return rc; } /* +** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() +** at the conclusion of a transaction. +*/ +static void btreeEndTransaction(Btree *p){ + BtShared *pBt = p->pBt; + BtCursor *pCsr; + assert( sqlite3BtreeHoldsMutex(p) ); + + /* Search for a cursor held open by this b-tree connection. If one exists, + ** then the transaction will be downgraded to a read-only transaction + ** instead of actually concluded. A subsequent call to CommitPhaseTwo() + ** or Rollback() will finish the transaction and unlock the database. */ + for(pCsr=pBt->pCursor; pCsr && pCsr->pBtree!=p; pCsr=pCsr->pNext); + assert( pCsr==0 || p->inTrans>TRANS_NONE ); + + btreeClearHasContent(pBt); + if( pCsr ){ + downgradeAllSharedCacheTableLocks(p); + p->inTrans = TRANS_READ; + }else{ + /* If the handle had any kind of transaction open, decrement the + ** transaction count of the shared btree. If the transaction count + ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() + ** call below will unlock the pager. */ + if( p->inTrans!=TRANS_NONE ){ + clearAllSharedCacheTableLocks(p); + pBt->nTransaction--; + if( 0==pBt->nTransaction ){ + pBt->inTransaction = TRANS_NONE; + } + } + + /* Set the current transaction state to TRANS_NONE and unlock the + ** pager if this call closed the only read or write transaction. */ + p->inTrans = TRANS_NONE; + unlockBtreeIfUnused(pBt); + } + + btreeIntegrity(p); +} + +/* ** Commit the transaction currently in progress. ** ** This routine implements the second phase of a 2-phase commit. The -** sqlite3BtreeSync() routine does the first phase and should be invoked -** prior to calling this routine. The sqlite3BtreeSync() routine did -** all the work of writing information out to disk and flushing the +** sqlite3BtreeCommitPhaseOne() routine does the first phase and should +** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() +** routine did all the work of writing information out to disk and flushing the ** contents so that they are written onto the disk platter. All this -** routine has to do is delete or truncate the rollback journal -** (which causes the transaction to commit) and drop locks. +** routine has to do is delete or truncate or zero the header in the +** the rollback journal (which causes the transaction to commit) and +** drop locks. ** ** This will release the write lock on the database file. If there ** are no active cursors, it also releases the read lock. @@ -2497,7 +3027,6 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p){ BtShared *pBt = p->pBt; sqlite3BtreeEnter(p); - pBt->db = p->db; btreeIntegrity(p); /* If the handle has a write-transaction open, commit the shared-btrees @@ -2513,29 +3042,9 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p){ return rc; } pBt->inTransaction = TRANS_READ; - pBt->inStmt = 0; - } - unlockAllTables(p); - - /* If the handle has any kind of transaction open, decrement the transaction - ** count of the shared btree. If the transaction count reaches 0, set - ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below - ** will unlock the pager. - */ - if( p->inTrans!=TRANS_NONE ){ - pBt->nTransaction--; - if( 0==pBt->nTransaction ){ - pBt->inTransaction = TRANS_NONE; - } } - /* Set the handles current transaction state to TRANS_NONE and unlock - ** the pager if this call closed the only read or write transaction. - */ - p->inTrans = TRANS_NONE; - unlockBtreeIfUnused(pBt); - - btreeIntegrity(p); + btreeEndTransaction(p); sqlite3BtreeLeave(p); return SQLITE_OK; } @@ -2596,9 +3105,14 @@ void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ BtCursor *p; sqlite3BtreeEnter(pBtree); for(p=pBtree->pBt->pCursor; p; p=p->pNext){ - clearCursorPosition(p); + int i; + sqlite3BtreeClearCursor(p); p->eState = CURSOR_FAULT; - p->skip = errCode; + p->skipNext = errCode; + for(i=0; i<=p->iPage; i++){ + releasePage(p->apPage[i]); + p->apPage[i] = 0; + } } sqlite3BtreeLeave(pBtree); } @@ -2618,11 +3132,10 @@ int sqlite3BtreeRollback(Btree *p){ MemPage *pPage1; sqlite3BtreeEnter(p); - pBt->db = p->db; rc = saveAllCursors(pBt, 0, 0); #ifndef SQLITE_OMIT_SHARED_CACHE if( rc!=SQLITE_OK ){ - /* This is a horrible situation. An IO or malloc() error occured whilst + /* This is a horrible situation. An IO or malloc() error occurred whilst ** trying to save cursor positions. If this is an automatic rollback (as ** the result of a constraint, malloc() failure or IO error) then ** the cache may be internally inconsistent (not contain valid trees) so @@ -2633,15 +3146,10 @@ int sqlite3BtreeRollback(Btree *p){ } #endif btreeIntegrity(p); - unlockAllTables(p); if( p->inTrans==TRANS_WRITE ){ int rc2; -#ifndef SQLITE_OMIT_AUTOVACUUM - pBt->nTrunc = 0; -#endif - assert( TRANS_WRITE==pBt->inTransaction ); rc2 = sqlite3PagerRollback(pBt->pPager); if( rc2!=SQLITE_OK ){ @@ -2649,108 +3157,95 @@ int sqlite3BtreeRollback(Btree *p){ } /* The rollback may have destroyed the pPage1->aData value. So - ** call sqlite3BtreeGetPage() on page 1 again to make + ** call btreeGetPage() on page 1 again to make ** sure pPage1->aData is set correctly. */ - if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ + if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ releasePage(pPage1); } assert( countWriteCursors(pBt)==0 ); pBt->inTransaction = TRANS_READ; } - if( p->inTrans!=TRANS_NONE ){ - assert( pBt->nTransaction>0 ); - pBt->nTransaction--; - if( 0==pBt->nTransaction ){ - pBt->inTransaction = TRANS_NONE; - } - } - - p->inTrans = TRANS_NONE; - pBt->inStmt = 0; - unlockBtreeIfUnused(pBt); - - btreeIntegrity(p); + btreeEndTransaction(p); sqlite3BtreeLeave(p); return rc; } /* -** Start a statement subtransaction. The subtransaction can -** can be rolled back independently of the main transaction. -** You must start a transaction before starting a subtransaction. -** The subtransaction is ended automatically if the main transaction -** commits or rolls back. -** -** Only one subtransaction may be active at a time. It is an error to try -** to start a new subtransaction if another subtransaction is already active. +** Start a statement subtransaction. The subtransaction can can be rolled +** back independently of the main transaction. You must start a transaction +** before starting a subtransaction. The subtransaction is ended automatically +** if the main transaction commits or rolls back. ** ** Statement subtransactions are used around individual SQL statements ** that are contained within a BEGIN...COMMIT block. If a constraint ** error occurs within the statement, the effect of that one statement ** can be rolled back without having to rollback the entire transaction. +** +** A statement sub-transaction is implemented as an anonymous savepoint. The +** value passed as the second parameter is the total number of savepoints, +** including the new anonymous savepoint, open on the B-Tree. i.e. if there +** are no active savepoints and no other statement-transactions open, +** iStatement is 1. This anonymous savepoint can be released or rolled back +** using the sqlite3BtreeSavepoint() function. */ -int sqlite3BtreeBeginStmt(Btree *p){ +int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ int rc; BtShared *pBt = p->pBt; sqlite3BtreeEnter(p); - pBt->db = p->db; - if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){ - rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; + assert( p->inTrans==TRANS_WRITE ); + assert( pBt->readOnly==0 ); + assert( iStatement>0 ); + assert( iStatement>p->db->nSavepoint ); + if( NEVER(p->inTrans!=TRANS_WRITE || pBt->readOnly) ){ + rc = SQLITE_INTERNAL; }else{ assert( pBt->inTransaction==TRANS_WRITE ); - rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager); - pBt->inStmt = 1; - } - sqlite3BtreeLeave(p); - return rc; -} - - -/* -** Commit the statment subtransaction currently in progress. If no -** subtransaction is active, this is a no-op. -*/ -int sqlite3BtreeCommitStmt(Btree *p){ - int rc; - BtShared *pBt = p->pBt; - sqlite3BtreeEnter(p); - pBt->db = p->db; - if( pBt->inStmt && !pBt->readOnly ){ - rc = sqlite3PagerStmtCommit(pBt->pPager); - }else{ - rc = SQLITE_OK; + /* At the pager level, a statement transaction is a savepoint with + ** an index greater than all savepoints created explicitly using + ** SQL statements. It is illegal to open, release or rollback any + ** such savepoints while the statement transaction savepoint is active. + */ + rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); } - pBt->inStmt = 0; sqlite3BtreeLeave(p); return rc; } /* -** Rollback the active statement subtransaction. If no subtransaction -** is active this routine is a no-op. +** The second argument to this function, op, is always SAVEPOINT_ROLLBACK +** or SAVEPOINT_RELEASE. This function either releases or rolls back the +** savepoint identified by parameter iSavepoint, depending on the value +** of op. ** -** All cursors will be invalidated by this operation. Any attempt -** to use a cursor that was open at the beginning of this operation -** will result in an error. +** Normally, iSavepoint is greater than or equal to zero. However, if op is +** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the +** contents of the entire transaction are rolled back. This is different +** from a normal transaction rollback, as no locks are released and the +** transaction remains open. */ -int sqlite3BtreeRollbackStmt(Btree *p){ +int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ int rc = SQLITE_OK; - BtShared *pBt = p->pBt; - sqlite3BtreeEnter(p); - pBt->db = p->db; - if( pBt->inStmt && !pBt->readOnly ){ - rc = sqlite3PagerStmtRollback(pBt->pPager); - pBt->inStmt = 0; + if( p && p->inTrans==TRANS_WRITE ){ + BtShared *pBt = p->pBt; + assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); + assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); + sqlite3BtreeEnter(p); + rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); + if( rc==SQLITE_OK ){ + rc = newDatabase(pBt); + } + sqlite3BtreeLeave(p); } - sqlite3BtreeLeave(p); return rc; } /* ** Create a new cursor for the BTree whose root is on the page -** iTable. The act of acquiring a cursor gets a read lock on -** the database file. +** iTable. If a read-only cursor is requested, it is assumed that +** the caller already has at least a read-only transaction open +** on the database already. If a write-cursor is requested, then +** the caller is assumed to have an open write transaction. ** ** If wrFlag==0, then the cursor can only be used for reading. ** If wrFlag==1, then the cursor can be used for reading or for @@ -2773,6 +3268,9 @@ int sqlite3BtreeRollbackStmt(Btree *p){ ** No checking is done to make sure that page iTable really is the ** root page of a b-tree. If it is not, then the cursor acquired ** will not work correctly. +** +** It is assumed that the sqlite3BtreeCursorSize() bytes of memory +** pointed to by pCur have been zeroed by the caller. */ static int btreeCursor( Btree *p, /* The btree */ @@ -2781,59 +3279,46 @@ static int btreeCursor( struct KeyInfo *pKeyInfo, /* First arg to comparison function */ BtCursor *pCur /* Space for new cursor */ ){ - int rc; - BtShared *pBt = p->pBt; + BtShared *pBt = p->pBt; /* Shared b-tree handle */ assert( sqlite3BtreeHoldsMutex(p) ); - if( wrFlag ){ - if( pBt->readOnly ){ - return SQLITE_READONLY; - } - if( checkReadLocks(p, iTable, 0, 0) ){ - return SQLITE_LOCKED; - } - } + assert( wrFlag==0 || wrFlag==1 ); - if( pBt->pPage1==0 ){ - rc = lockBtreeWithRetry(p); - if( rc!=SQLITE_OK ){ - return rc; - } - if( pBt->readOnly && wrFlag ){ - return SQLITE_READONLY; - } - } - pCur->pgnoRoot = (Pgno)iTable; - if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){ - rc = SQLITE_EMPTY; - goto create_cursor_exception; + /* The following assert statements verify that if this is a sharable + ** b-tree database, the connection is holding the required table locks, + ** and that no other connection has any open cursor that conflicts with + ** this lock. */ + assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) ); + assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); + + /* Assert that the caller has opened the required transaction. */ + assert( p->inTrans>TRANS_NONE ); + assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); + assert( pBt->pPage1 && pBt->pPage1->aData ); + + if( NEVER(wrFlag && pBt->readOnly) ){ + return SQLITE_READONLY; } - rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0); - if( rc!=SQLITE_OK ){ - goto create_cursor_exception; + if( iTable==1 && pagerPagecount(pBt)==0 ){ + return SQLITE_EMPTY; } /* Now that no other errors can occur, finish filling in the BtCursor - ** variables, link the cursor into the BtShared list and set *ppCur (the - ** output argument to this function). - */ + ** variables and link the cursor into the BtShared list. */ + pCur->pgnoRoot = (Pgno)iTable; + pCur->iPage = -1; pCur->pKeyInfo = pKeyInfo; pCur->pBtree = p; pCur->pBt = pBt; - pCur->wrFlag = wrFlag; + pCur->wrFlag = (u8)wrFlag; pCur->pNext = pBt->pCursor; if( pCur->pNext ){ pCur->pNext->pPrev = pCur; } pBt->pCursor = pCur; pCur->eState = CURSOR_INVALID; - + pCur->cachedRowid = 0; return SQLITE_OK; - -create_cursor_exception: - releasePage(pCur->pPage); - unlockBtreeIfUnused(pBt); - return rc; } int sqlite3BtreeCursor( Btree *p, /* The btree */ @@ -2844,16 +3329,52 @@ int sqlite3BtreeCursor( ){ int rc; sqlite3BtreeEnter(p); - p->pBt->db = p->db; rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); sqlite3BtreeLeave(p); return rc; } -int sqlite3BtreeCursorSize(){ + +/* +** Return the size of a BtCursor object in bytes. +** +** This interfaces is needed so that users of cursors can preallocate +** sufficient storage to hold a cursor. The BtCursor object is opaque +** to users so they cannot do the sizeof() themselves - they must call +** this routine. +*/ +int sqlite3BtreeCursorSize(void){ return sizeof(BtCursor); } +/* +** Set the cached rowid value of every cursor in the same database file +** as pCur and having the same root page number as pCur. The value is +** set to iRowid. +** +** Only positive rowid values are considered valid for this cache. +** The cache is initialized to zero, indicating an invalid cache. +** A btree will work fine with zero or negative rowids. We just cannot +** cache zero or negative rowids, which means tables that use zero or +** negative rowids might run a little slower. But in practice, zero +** or negative rowids are very uncommon so this should not be a problem. +*/ +void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){ + BtCursor *p; + for(p=pCur->pBt->pCursor; p; p=p->pNext){ + if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid; + } + assert( pCur->cachedRowid==iRowid ); +} +/* +** Return the cached rowid for the given cursor. A negative or zero +** return value indicates that the rowid cache is invalid and should be +** ignored. If the rowid cache has never before been set, then a +** zero is returned. +*/ +sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){ + return pCur->cachedRowid; +} /* ** Close a cursor. The read lock on the database file is released @@ -2862,10 +3383,10 @@ int sqlite3BtreeCursorSize(){ int sqlite3BtreeCloseCursor(BtCursor *pCur){ Btree *pBtree = pCur->pBtree; if( pBtree ){ + int i; BtShared *pBt = pCur->pBt; sqlite3BtreeEnter(pBtree); - pBt->db = pBtree->db; - clearCursorPosition(pCur); + sqlite3BtreeClearCursor(pCur); if( pCur->pPrev ){ pCur->pPrev->pNext = pCur->pNext; }else{ @@ -2874,7 +3395,9 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ if( pCur->pNext ){ pCur->pNext->pPrev = pCur->pPrev; } - releasePage(pCur->pPage); + for(i=0; i<=pCur->iPage; i++){ + releasePage(pCur->apPage[i]); + } unlockBtreeIfUnused(pBt); invalidateOverflowCache(pCur); /* sqlite3_free(pCur); */ @@ -2884,37 +3407,12 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ } /* -** Make a temporary cursor by filling in the fields of pTempCur. -** The temporary cursor is not on the cursor list for the Btree. -*/ -void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){ - assert( cursorHoldsMutex(pCur) ); - memcpy(pTempCur, pCur, sizeof(*pCur)); - pTempCur->pNext = 0; - pTempCur->pPrev = 0; - if( pTempCur->pPage ){ - sqlite3PagerRef(pTempCur->pPage->pDbPage); - } -} - -/* -** Delete a temporary cursor such as was made by the CreateTemporaryCursor() -** function above. -*/ -void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ - assert( cursorHoldsMutex(pCur) ); - if( pCur->pPage ){ - sqlite3PagerUnref(pCur->pPage->pDbPage); - } -} - -/* ** Make sure the BtCursor* given in the argument has a valid ** BtCursor.info structure. If it is not already valid, call -** sqlite3BtreeParseCell() to fill it in. +** btreeParseCell() to fill it in. ** ** BtCursor.info is a cache of the information in the current cell. -** Using this cache reduces the number of calls to sqlite3BtreeParseCell(). +** Using this cache reduces the number of calls to btreeParseCell(). ** ** 2007-06-25: There is a bug in some versions of MSVC that cause the ** compiler to crash when getCellInfo() is implemented as a macro. @@ -2926,8 +3424,9 @@ void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ #ifndef NDEBUG static void assertCellInfo(BtCursor *pCur){ CellInfo info; + int iPage = pCur->iPage; memset(&info, 0, sizeof(info)); - sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info); + btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info); assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); } #else @@ -2937,7 +3436,8 @@ void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ /* Use a real function in MSVC to work around bugs in that compiler. */ static void getCellInfo(BtCursor *pCur){ if( pCur->info.nSize==0 ){ - sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); + int iPage = pCur->iPage; + btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); pCur->validNKey = 1; }else{ assertCellInfo(pCur); @@ -2945,15 +3445,27 @@ void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ } #else /* if not _MSC_VER */ /* Use a macro in all other compilers so that the function is inlined */ -#define getCellInfo(pCur) \ - if( pCur->info.nSize==0 ){ \ - sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); \ - pCur->validNKey = 1; \ - }else{ \ - assertCellInfo(pCur); \ +#define getCellInfo(pCur) \ + if( pCur->info.nSize==0 ){ \ + int iPage = pCur->iPage; \ + btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \ + pCur->validNKey = 1; \ + }else{ \ + assertCellInfo(pCur); \ } #endif /* _MSC_VER */ +#ifndef NDEBUG /* The next routine used only within assert() statements */ +/* +** Return true if the given BtCursor is valid. A valid cursor is one +** that is currently pointing to a row in a (non-empty) table. +** This is a verification routine is used only within assert() statements. +*/ +int sqlite3BtreeCursorIsValid(BtCursor *pCur){ + return pCur && pCur->eState==CURSOR_VALID; +} +#endif /* NDEBUG */ + /* ** Set *pSize to the size of the buffer needed to hold the value of ** the key for the current entry. If the cursor is not pointing @@ -2961,47 +3473,41 @@ void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){ ** ** For a table with the INTKEY flag set, this routine returns the key ** itself, not the number of bytes in the key. +** +** The caller must position the cursor prior to invoking this routine. +** +** This routine cannot fail. It always returns SQLITE_OK. */ int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ - int rc; - assert( cursorHoldsMutex(pCur) ); - rc = restoreCursorPosition(pCur); - if( rc==SQLITE_OK ){ - assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); - if( pCur->eState==CURSOR_INVALID ){ - *pSize = 0; - }else{ - getCellInfo(pCur); - *pSize = pCur->info.nKey; - } + assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); + if( pCur->eState!=CURSOR_VALID ){ + *pSize = 0; + }else{ + getCellInfo(pCur); + *pSize = pCur->info.nKey; } - return rc; + return SQLITE_OK; } /* ** Set *pSize to the number of bytes of data in the entry the -** cursor currently points to. Always return SQLITE_OK. -** Failure is not possible. If the cursor is not currently -** pointing to an entry (which can happen, for example, if -** the database is empty) then *pSize is set to 0. +** cursor currently points to. +** +** The caller must guarantee that the cursor is pointing to a non-NULL +** valid entry. In other words, the calling procedure must guarantee +** that the cursor has Cursor.eState==CURSOR_VALID. +** +** Failure is not possible. This function always returns SQLITE_OK. +** It might just as well be a procedure (returning void) but we continue +** to return an integer result code for historical reasons. */ int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ - int rc; - assert( cursorHoldsMutex(pCur) ); - rc = restoreCursorPosition(pCur); - if( rc==SQLITE_OK ){ - assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); - if( pCur->eState==CURSOR_INVALID ){ - /* Not pointing at a valid entry - set *pSize to 0. */ - *pSize = 0; - }else{ - getCellInfo(pCur); - *pSize = pCur->info.nData; - } - } - return rc; + assert( pCur->eState==CURSOR_VALID ); + getCellInfo(pCur); + *pSize = pCur->info.nData; + return SQLITE_OK; } /* @@ -3012,34 +3518,29 @@ int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ ** ** If an error occurs an SQLite error code is returned. Otherwise: ** -** Unless pPgnoNext is NULL, the page number of the next overflow -** page in the linked list is written to *pPgnoNext. If page ovfl -** is the last page in its linked list, *pPgnoNext is set to zero. +** The page number of the next overflow page in the linked list is +** written to *pPgnoNext. If page ovfl is the last page in its linked +** list, *pPgnoNext is set to zero. ** -** If ppPage is not NULL, *ppPage is set to the MemPage* handle -** for page ovfl. The underlying pager page may have been requested -** with the noContent flag set, so the page data accessable via -** this handle may not be trusted. +** If ppPage is not NULL, and a reference to the MemPage object corresponding +** to page number pOvfl was obtained, then *ppPage is set to point to that +** reference. It is the responsibility of the caller to call releasePage() +** on *ppPage to free the reference. In no reference was obtained (because +** the pointer-map was used to obtain the value for *pPgnoNext), then +** *ppPage is set to zero. */ static int getOverflowPage( - BtShared *pBt, - Pgno ovfl, /* Overflow page */ - MemPage **ppPage, /* OUT: MemPage handle */ + BtShared *pBt, /* The database file */ + Pgno ovfl, /* Current overflow page number */ + MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ Pgno *pPgnoNext /* OUT: Next overflow page number */ ){ Pgno next = 0; - int rc; + MemPage *pPage = 0; + int rc = SQLITE_OK; assert( sqlite3_mutex_held(pBt->mutex) ); - /* One of these must not be NULL. Otherwise, why call this function? */ - assert(ppPage || pPgnoNext); - - /* If pPgnoNext is NULL, then this function is being called to obtain - ** a MemPage* reference only. No page-data is required in this case. - */ - if( !pPgnoNext ){ - return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1); - } + assert(pPgnoNext); #ifndef SQLITE_OMIT_AUTOVACUUM /* Try to find the next page in the overflow list using the @@ -3057,36 +3558,32 @@ static int getOverflowPage( iGuess++; } - if( iGuess<=pagerPagecount(pBt->pPager) ){ + if( iGuess<=pagerPagecount(pBt) ){ rc = ptrmapGet(pBt, iGuess, &eType, &pgno); - if( rc!=SQLITE_OK ){ - return rc; - } - if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ + if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ next = iGuess; + rc = SQLITE_DONE; } } } #endif - if( next==0 || ppPage ){ - MemPage *pPage = 0; - - rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0); - assert(rc==SQLITE_OK || pPage==0); - if( next==0 && rc==SQLITE_OK ){ + assert( next==0 || rc==SQLITE_DONE ); + if( rc==SQLITE_OK ){ + rc = btreeGetPage(pBt, ovfl, &pPage, 0); + assert( rc==SQLITE_OK || pPage==0 ); + if( rc==SQLITE_OK ){ next = get4byte(pPage->aData); } - - if( ppPage ){ - *ppPage = pPage; - }else{ - releasePage(pPage); - } } - *pPgnoNext = next; - return rc; + *pPgnoNext = next; + if( ppPage ){ + *ppPage = pPage; + }else{ + releasePage(pPage); + } + return (rc==SQLITE_DONE ? SQLITE_OK : rc); } /* @@ -3131,10 +3628,8 @@ static int copyPayload( ** A total of "amt" bytes are read or written beginning at "offset". ** Data is read to or from the buffer pBuf. ** -** This routine does not make a distinction between key and data. -** It just reads or writes bytes from the payload area. Data might -** appear on the main page or be scattered out on multiple overflow -** pages. +** The content being read or written might appear on the main page +** or be scattered out on multiple overflow pages. ** ** If the BtCursor.isIncrblobHandle flag is set, and the current ** cursor entry uses one or more overflow pages, this function @@ -3153,35 +3648,32 @@ static int copyPayload( */ static int accessPayload( BtCursor *pCur, /* Cursor pointing to entry to read from */ - int offset, /* Begin reading this far into payload */ - int amt, /* Read this many bytes */ + u32 offset, /* Begin reading this far into payload */ + u32 amt, /* Read this many bytes */ unsigned char *pBuf, /* Write the bytes into this buffer */ - int skipKey, /* offset begins at data if this is true */ int eOp /* zero to read. non-zero to write. */ ){ unsigned char *aPayload; int rc = SQLITE_OK; u32 nKey; int iIdx = 0; - MemPage *pPage = pCur->pPage; /* Btree page of current cursor entry */ - BtShared *pBt; /* Btree this cursor belongs to */ + MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */ + BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ assert( pPage ); assert( pCur->eState==CURSOR_VALID ); - assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); - assert( offset>=0 ); + assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); assert( cursorHoldsMutex(pCur) ); getCellInfo(pCur); aPayload = pCur->info.pCell + pCur->info.nHeader; - nKey = (pPage->intKey ? 0 : pCur->info.nKey); + nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey); - if( skipKey ){ - offset += nKey; - } - if( offset+amt > nKey+pCur->info.nData ){ + if( NEVER(offset+amt > nKey+pCur->info.nData) + || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] + ){ /* Trying to read or write past the end of the data is an error */ - return SQLITE_ERROR; + return SQLITE_CORRUPT_BKPT; } /* Check if data must be read/written to/from the btree page itself. */ @@ -3198,9 +3690,8 @@ static int accessPayload( offset -= pCur->info.nLocal; } - pBt = pCur->pBt; if( rc==SQLITE_OK && amt>0 ){ - const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ + const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ Pgno nextPage; nextPage = get4byte(&aPayload[pCur->info.nLocal]); @@ -3216,7 +3707,9 @@ static int accessPayload( if( pCur->isIncrblobHandle && !pCur->aOverflow ){ int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); - if( nOvfl && !pCur->aOverflow ){ + /* nOvfl is always positive. If it were zero, fetchPayload would have + ** been used instead of this routine. */ + if( ALWAYS(nOvfl) && !pCur->aOverflow ){ rc = SQLITE_NOMEM; } } @@ -3290,26 +3783,19 @@ static int accessPayload( ** "amt" bytes will be transfered into pBuf[]. The transfer ** begins at "offset". ** +** The caller must ensure that pCur is pointing to a valid row +** in the table. +** ** Return SQLITE_OK on success or an error code if anything goes ** wrong. An error is returned if "offset+amt" is larger than ** the available payload. */ int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ - int rc; - assert( cursorHoldsMutex(pCur) ); - rc = restoreCursorPosition(pCur); - if( rc==SQLITE_OK ){ - assert( pCur->eState==CURSOR_VALID ); - assert( pCur->pPage!=0 ); - if( pCur->pPage->intKey ){ - return SQLITE_CORRUPT_BKPT; - } - assert( pCur->pPage->intKey==0 ); - assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); - rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0); - } - return rc; + assert( pCur->eState==CURSOR_VALID ); + assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); + assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); + return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); } /* @@ -3334,9 +3820,9 @@ int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ rc = restoreCursorPosition(pCur); if( rc==SQLITE_OK ){ assert( pCur->eState==CURSOR_VALID ); - assert( pCur->pPage!=0 ); - assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); - rc = accessPayload(pCur, offset, amt, pBuf, 1, 0); + assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); + assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); + rc = accessPayload(pCur, offset, amt, pBuf, 0); } return rc; } @@ -3353,7 +3839,7 @@ int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ ** and data to fit on the local page and for there to be no overflow ** pages. When that is so, this routine can be used to access the ** key and data without making a copy. If the key and/or data spills -** onto overflow pages, then accessPayload() must be used to reassembly +** onto overflow pages, then accessPayload() must be used to reassemble ** the key/data and copy it into a preallocated buffer. ** ** The pointer returned by this routine looks directly into the cached @@ -3368,29 +3854,30 @@ static const unsigned char *fetchPayload( unsigned char *aPayload; MemPage *pPage; u32 nKey; - int nLocal; + u32 nLocal; - assert( pCur!=0 && pCur->pPage!=0 ); + assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]); assert( pCur->eState==CURSOR_VALID ); assert( cursorHoldsMutex(pCur) ); - pPage = pCur->pPage; - assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); - getCellInfo(pCur); + pPage = pCur->apPage[pCur->iPage]; + assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); + if( NEVER(pCur->info.nSize==0) ){ + btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage], + &pCur->info); + } aPayload = pCur->info.pCell; aPayload += pCur->info.nHeader; if( pPage->intKey ){ nKey = 0; }else{ - nKey = pCur->info.nKey; + nKey = (int)pCur->info.nKey; } if( skipKey ){ aPayload += nKey; nLocal = pCur->info.nLocal - nKey; }else{ nLocal = pCur->info.nLocal; - if( nLocal>nKey ){ - nLocal = nKey; - } + assert( nLocal<=nKey ); } *pAmt = nLocal; return aPayload; @@ -3412,68 +3899,79 @@ static const unsigned char *fetchPayload( ** in the common case where no overflow pages are used. */ const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ + const void *p = 0; + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); assert( cursorHoldsMutex(pCur) ); - if( pCur->eState==CURSOR_VALID ){ - return (const void*)fetchPayload(pCur, pAmt, 0); + if( ALWAYS(pCur->eState==CURSOR_VALID) ){ + p = (const void*)fetchPayload(pCur, pAmt, 0); } - return 0; + return p; } const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ + const void *p = 0; + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); assert( cursorHoldsMutex(pCur) ); - if( pCur->eState==CURSOR_VALID ){ - return (const void*)fetchPayload(pCur, pAmt, 1); + if( ALWAYS(pCur->eState==CURSOR_VALID) ){ + p = (const void*)fetchPayload(pCur, pAmt, 1); } - return 0; + return p; } /* ** Move the cursor down to a new child page. The newPgno argument is the ** page number of the child page to move to. +** +** This function returns SQLITE_CORRUPT if the page-header flags field of +** the new child page does not match the flags field of the parent (i.e. +** if an intkey page appears to be the parent of a non-intkey page, or +** vice-versa). */ static int moveToChild(BtCursor *pCur, u32 newPgno){ int rc; + int i = pCur->iPage; MemPage *pNewPage; - MemPage *pOldPage; BtShared *pBt = pCur->pBt; assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); - rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage); + assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); + if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ + return SQLITE_CORRUPT_BKPT; + } + rc = getAndInitPage(pBt, newPgno, &pNewPage); if( rc ) return rc; - pNewPage->idxParent = pCur->idx; - pOldPage = pCur->pPage; - pOldPage->idxShift = 0; - releasePage(pOldPage); - pCur->pPage = pNewPage; - pCur->idx = 0; + pCur->apPage[i+1] = pNewPage; + pCur->aiIdx[i+1] = 0; + pCur->iPage++; + pCur->info.nSize = 0; pCur->validNKey = 0; - if( pNewPage->nCell<1 ){ + if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){ return SQLITE_CORRUPT_BKPT; } return SQLITE_OK; } +#ifndef NDEBUG /* -** Return true if the page is the virtual root of its table. -** -** The virtual root page is the root page for most tables. But -** for the table rooted on page 1, sometime the real root page -** is empty except for the right-pointer. In such cases the -** virtual root page is the page that the right-pointer of page -** 1 is pointing to. +** Page pParent is an internal (non-leaf) tree page. This function +** asserts that page number iChild is the left-child if the iIdx'th +** cell in page pParent. Or, if iIdx is equal to the total number of +** cells in pParent, that page number iChild is the right-child of +** the page. */ -int sqlite3BtreeIsRootPage(MemPage *pPage){ - MemPage *pParent; - - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - pParent = pPage->pParent; - if( pParent==0 ) return 1; - if( pParent->pgno>1 ) return 0; - if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1; - return 0; +static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ + assert( iIdx<=pParent->nCell ); + if( iIdx==pParent->nCell ){ + assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); + }else{ + assert( get4byte(findCell(pParent, iIdx))==iChild ); + } } +#else +# define assertParentIndex(x,y,z) +#endif /* ** Move the cursor up to the parent page. @@ -3483,30 +3981,42 @@ int sqlite3BtreeIsRootPage(MemPage *pPage){ ** right-most child page then pCur->idx is set to one more than ** the largest cell index. */ -void sqlite3BtreeMoveToParent(BtCursor *pCur){ - MemPage *pParent; - MemPage *pPage; - int idxParent; - +static void moveToParent(BtCursor *pCur){ assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); - pPage = pCur->pPage; - assert( pPage!=0 ); - assert( !sqlite3BtreeIsRootPage(pPage) ); - pParent = pPage->pParent; - assert( pParent!=0 ); - idxParent = pPage->idxParent; - sqlite3PagerRef(pParent->pDbPage); - releasePage(pPage); - pCur->pPage = pParent; + assert( pCur->iPage>0 ); + assert( pCur->apPage[pCur->iPage] ); + assertParentIndex( + pCur->apPage[pCur->iPage-1], + pCur->aiIdx[pCur->iPage-1], + pCur->apPage[pCur->iPage]->pgno + ); + releasePage(pCur->apPage[pCur->iPage]); + pCur->iPage--; pCur->info.nSize = 0; pCur->validNKey = 0; - assert( pParent->idxShift==0 ); - pCur->idx = idxParent; } /* -** Move the cursor to the root page +** Move the cursor to point to the root page of its b-tree structure. +** +** If the table has a virtual root page, then the cursor is moved to point +** to the virtual root page instead of the actual root page. A table has a +** virtual root page when the actual root page contains no cells and a +** single child page. This can only happen with the table rooted at page 1. +** +** If the b-tree structure is empty, the cursor state is set to +** CURSOR_INVALID. Otherwise, the cursor is set to point to the first +** cell located on the root (or virtual root) page and the cursor state +** is set to CURSOR_VALID. +** +** If this function returns successfully, it may be assumed that the +** page-header flags indicate that the [virtual] root-page is the expected +** kind of b-tree page (i.e. if when opening the cursor the caller did not +** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, +** indicating a table b-tree, or if the caller did specify a KeyInfo +** structure the flags byte is set to 0x02 or 0x0A, indicating an index +** b-tree). */ static int moveToRoot(BtCursor *pCur){ MemPage *pRoot; @@ -3520,36 +4030,60 @@ static int moveToRoot(BtCursor *pCur){ assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); if( pCur->eState>=CURSOR_REQUIRESEEK ){ if( pCur->eState==CURSOR_FAULT ){ - return pCur->skip; + assert( pCur->skipNext!=SQLITE_OK ); + return pCur->skipNext; } - clearCursorPosition(pCur); + sqlite3BtreeClearCursor(pCur); } - pRoot = pCur->pPage; - if( pRoot && pRoot->pgno==pCur->pgnoRoot ){ - assert( pRoot->isInit ); + + if( pCur->iPage>=0 ){ + int i; + for(i=1; i<=pCur->iPage; i++){ + releasePage(pCur->apPage[i]); + } + pCur->iPage = 0; }else{ - if( - SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0)) - ){ + rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]); + if( rc!=SQLITE_OK ){ pCur->eState = CURSOR_INVALID; return rc; } - releasePage(pCur->pPage); - pCur->pPage = pRoot; + pCur->iPage = 0; + + /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor + ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is + ** NULL, the caller expects a table b-tree. If this is not the case, + ** return an SQLITE_CORRUPT error. */ + assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 ); + if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){ + return SQLITE_CORRUPT_BKPT; + } } - pCur->idx = 0; + + /* Assert that the root page is of the correct type. This must be the + ** case as the call to this function that loaded the root-page (either + ** this call or a previous invocation) would have detected corruption + ** if the assumption were not true, and it is not possible for the flags + ** byte to have been modified while this cursor is holding a reference + ** to the page. */ + pRoot = pCur->apPage[0]; + assert( pRoot->pgno==pCur->pgnoRoot ); + assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey ); + + pCur->aiIdx[0] = 0; pCur->info.nSize = 0; pCur->atLast = 0; pCur->validNKey = 0; + if( pRoot->nCell==0 && !pRoot->leaf ){ Pgno subpage; - assert( pRoot->pgno==1 ); + if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); - assert( subpage>0 ); pCur->eState = CURSOR_VALID; rc = moveToChild(pCur, subpage); + }else{ + pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID); } - pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID); return rc; } @@ -3567,9 +4101,9 @@ static int moveToLeftmost(BtCursor *pCur){ assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); - while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ - assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); - pgno = get4byte(findCell(pPage, pCur->idx)); + while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ + assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); + pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage])); rc = moveToChild(pCur, pgno); } return rc; @@ -3588,21 +4122,21 @@ static int moveToLeftmost(BtCursor *pCur){ static int moveToRightmost(BtCursor *pCur){ Pgno pgno; int rc = SQLITE_OK; - MemPage *pPage; + MemPage *pPage = 0; assert( cursorHoldsMutex(pCur) ); assert( pCur->eState==CURSOR_VALID ); - while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ + while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); - pCur->idx = pPage->nCell; + pCur->aiIdx[pCur->iPage] = pPage->nCell; rc = moveToChild(pCur, pgno); } if( rc==SQLITE_OK ){ - pCur->idx = pPage->nCell - 1; + pCur->aiIdx[pCur->iPage] = pPage->nCell-1; pCur->info.nSize = 0; pCur->validNKey = 0; } - return SQLITE_OK; + return rc; } /* Move the cursor to the first entry in the table. Return SQLITE_OK @@ -3617,11 +4151,11 @@ int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ rc = moveToRoot(pCur); if( rc==SQLITE_OK ){ if( pCur->eState==CURSOR_INVALID ){ - assert( pCur->pPage->nCell==0 ); + assert( pCur->apPage[pCur->iPage]->nCell==0 ); *pRes = 1; rc = SQLITE_OK; }else{ - assert( pCur->pPage->nCell>0 ); + assert( pCur->apPage[pCur->iPage]->nCell>0 ); *pRes = 0; rc = moveToLeftmost(pCur); } @@ -3638,184 +4172,215 @@ int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ assert( cursorHoldsMutex(pCur) ); assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); + + /* If the cursor already points to the last entry, this is a no-op. */ + if( CURSOR_VALID==pCur->eState && pCur->atLast ){ +#ifdef SQLITE_DEBUG + /* This block serves to assert() that the cursor really does point + ** to the last entry in the b-tree. */ + int ii; + for(ii=0; ii<pCur->iPage; ii++){ + assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); + } + assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 ); + assert( pCur->apPage[pCur->iPage]->leaf ); +#endif + return SQLITE_OK; + } + rc = moveToRoot(pCur); if( rc==SQLITE_OK ){ if( CURSOR_INVALID==pCur->eState ){ - assert( pCur->pPage->nCell==0 ); + assert( pCur->apPage[pCur->iPage]->nCell==0 ); *pRes = 1; }else{ assert( pCur->eState==CURSOR_VALID ); *pRes = 0; rc = moveToRightmost(pCur); - getCellInfo(pCur); - pCur->atLast = rc==SQLITE_OK; + pCur->atLast = rc==SQLITE_OK ?1:0; } } return rc; } /* Move the cursor so that it points to an entry near the key -** specified by pKey/nKey/pUnKey. Return a success code. +** specified by pIdxKey or intKey. Return a success code. ** -** For INTKEY tables, only the nKey parameter is used. pKey -** and pUnKey must be NULL. For index tables, either pUnKey -** must point to a key that has already been unpacked, or else -** pKey/nKey describes a blob containing the key. +** For INTKEY tables, the intKey parameter is used. pIdxKey +** must be NULL. For index tables, pIdxKey is used and intKey +** is ignored. ** ** If an exact match is not found, then the cursor is always ** left pointing at a leaf page which would hold the entry if it ** were present. The cursor might point to an entry that comes ** before or after the key. ** -** The result of comparing the key with the entry to which the -** cursor is written to *pRes if pRes!=NULL. The meaning of -** this value is as follows: +** An integer is written into *pRes which is the result of +** comparing the key with the entry to which the cursor is +** pointing. The meaning of the integer written into +** *pRes is as follows: ** ** *pRes<0 The cursor is left pointing at an entry that -** is smaller than pKey or if the table is empty +** is smaller than intKey/pIdxKey or if the table is empty ** and the cursor is therefore left point to nothing. ** ** *pRes==0 The cursor is left pointing at an entry that -** exactly matches pKey. +** exactly matches intKey/pIdxKey. ** ** *pRes>0 The cursor is left pointing at an entry that -** is larger than pKey. +** is larger than intKey/pIdxKey. ** */ -int sqlite3BtreeMoveto( - BtCursor *pCur, /* The cursor to be moved */ - const void *pKey, /* The key content for indices. Not used by tables */ - UnpackedRecord *pUnKey,/* Unpacked version of pKey */ - i64 nKey, /* Size of pKey. Or the key for tables */ - int biasRight, /* If true, bias the search to the high end */ - int *pRes /* Search result flag */ +int sqlite3BtreeMovetoUnpacked( + BtCursor *pCur, /* The cursor to be moved */ + UnpackedRecord *pIdxKey, /* Unpacked index key */ + i64 intKey, /* The table key */ + int biasRight, /* If true, bias the search to the high end */ + int *pRes /* Write search results here */ ){ int rc; - char aSpace[200]; assert( cursorHoldsMutex(pCur) ); assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); + assert( pRes ); + assert( (pIdxKey==0)==(pCur->pKeyInfo==0) ); /* If the cursor is already positioned at the point we are trying ** to move to, then just return without doing any work */ - if( pCur->eState==CURSOR_VALID && pCur->validNKey && pCur->pPage->intKey ){ - if( pCur->info.nKey==nKey ){ + if( pCur->eState==CURSOR_VALID && pCur->validNKey + && pCur->apPage[0]->intKey + ){ + if( pCur->info.nKey==intKey ){ *pRes = 0; return SQLITE_OK; } - if( pCur->atLast && pCur->info.nKey<nKey ){ + if( pCur->atLast && pCur->info.nKey<intKey ){ *pRes = -1; return SQLITE_OK; } } - rc = moveToRoot(pCur); if( rc ){ return rc; } - assert( pCur->pPage ); - assert( pCur->pPage->isInit ); + assert( pCur->apPage[pCur->iPage] ); + assert( pCur->apPage[pCur->iPage]->isInit ); + assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID ); if( pCur->eState==CURSOR_INVALID ){ *pRes = -1; - assert( pCur->pPage->nCell==0 ); + assert( pCur->apPage[pCur->iPage]->nCell==0 ); return SQLITE_OK; } - if( pCur->pPage->intKey ){ - /* We are given an SQL table to search. The key is the integer - ** rowid contained in nKey. pKey and pUnKey should both be NULL */ - assert( pUnKey==0 ); - assert( pKey==0 ); - }else if( pUnKey==0 ){ - /* We are to search an SQL index using a key encoded as a blob. - ** The blob is found at pKey and is nKey bytes in length. Unpack - ** this key so that we can use it. */ - assert( pKey!=0 ); - pUnKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey, - aSpace, sizeof(aSpace)); - if( pUnKey==0 ) return SQLITE_NOMEM; - }else{ - /* We are to search an SQL index using a key that is already unpacked - ** and handed to us in pUnKey. */ - assert( pKey==0 ); - } + assert( pCur->apPage[0]->intKey || pIdxKey ); for(;;){ int lwr, upr; Pgno chldPg; - MemPage *pPage = pCur->pPage; - int c = -1; /* pRes return if table is empty must be -1 */ + MemPage *pPage = pCur->apPage[pCur->iPage]; + int c; + + /* pPage->nCell must be greater than zero. If this is the root-page + ** the cursor would have been INVALID above and this for(;;) loop + ** not run. If this is not the root-page, then the moveToChild() routine + ** would have already detected db corruption. Similarly, pPage must + ** be the right kind (index or table) of b-tree page. Otherwise + ** a moveToChild() or moveToRoot() call would have detected corruption. */ + assert( pPage->nCell>0 ); + assert( pPage->intKey==(pIdxKey==0) ); lwr = 0; upr = pPage->nCell-1; - if( !pPage->intKey && pUnKey==0 ){ - rc = SQLITE_CORRUPT_BKPT; - goto moveto_finish; - } if( biasRight ){ - pCur->idx = upr; + pCur->aiIdx[pCur->iPage] = (u16)upr; }else{ - pCur->idx = (upr+lwr)/2; + pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2); } - if( lwr<=upr ) for(;;){ - void *pCellKey; - i64 nCellKey; + for(;;){ + int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */ + u8 *pCell; /* Pointer to current cell in pPage */ + pCur->info.nSize = 0; - pCur->validNKey = 1; + pCell = findCell(pPage, idx) + pPage->childPtrSize; if( pPage->intKey ){ - u8 *pCell; - pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize; + i64 nCellKey; if( pPage->hasData ){ u32 dummy; pCell += getVarint32(pCell, dummy); } getVarint(pCell, (u64*)&nCellKey); - if( nCellKey==nKey ){ + if( nCellKey==intKey ){ c = 0; - }else if( nCellKey<nKey ){ + }else if( nCellKey<intKey ){ c = -1; }else{ - assert( nCellKey>nKey ); + assert( nCellKey>intKey ); c = +1; } + pCur->validNKey = 1; + pCur->info.nKey = nCellKey; }else{ - int available; - pCellKey = (void *)fetchPayload(pCur, &available, 0); - nCellKey = pCur->info.nKey; - if( available>=nCellKey ){ - c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pUnKey); + /* The maximum supported page-size is 32768 bytes. This means that + ** the maximum number of record bytes stored on an index B-Tree + ** page is at most 8198 bytes, which may be stored as a 2-byte + ** varint. This information is used to attempt to avoid parsing + ** the entire cell by checking for the cases where the record is + ** stored entirely within the b-tree page by inspecting the first + ** 2 bytes of the cell. + */ + int nCell = pCell[0]; + if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){ + /* This branch runs if the record-size field of the cell is a + ** single byte varint and the record fits entirely on the main + ** b-tree page. */ + c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey); + }else if( !(pCell[1] & 0x80) + && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal + ){ + /* The record-size field is a 2 byte varint and the record + ** fits entirely on the main b-tree page. */ + c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey); }else{ - pCellKey = sqlite3Malloc( nCellKey ); + /* The record flows over onto one or more overflow pages. In + ** this case the whole cell needs to be parsed, a buffer allocated + ** and accessPayload() used to retrieve the record into the + ** buffer before VdbeRecordCompare() can be called. */ + void *pCellKey; + u8 * const pCellBody = pCell - pPage->childPtrSize; + btreeParseCellPtr(pPage, pCellBody, &pCur->info); + nCell = (int)pCur->info.nKey; + pCellKey = sqlite3Malloc( nCell ); if( pCellKey==0 ){ rc = SQLITE_NOMEM; goto moveto_finish; } - rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey); - c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pUnKey); + rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); + if( rc ){ + sqlite3_free(pCellKey); + goto moveto_finish; + } + c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); sqlite3_free(pCellKey); - if( rc ) goto moveto_finish; } } if( c==0 ){ - pCur->info.nKey = nCellKey; if( pPage->intKey && !pPage->leaf ){ - lwr = pCur->idx; + lwr = idx; upr = lwr - 1; break; }else{ - if( pRes ) *pRes = 0; + *pRes = 0; rc = SQLITE_OK; goto moveto_finish; } } if( c<0 ){ - lwr = pCur->idx+1; + lwr = idx+1; }else{ - upr = pCur->idx-1; + upr = idx-1; } if( lwr>upr ){ - pCur->info.nKey = nCellKey; break; } - pCur->idx = (lwr+upr)/2; + pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2); } assert( lwr==upr+1 ); assert( pPage->isInit ); @@ -3827,23 +4392,18 @@ int sqlite3BtreeMoveto( chldPg = get4byte(findCell(pPage, lwr)); } if( chldPg==0 ){ - assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); - if( pRes ) *pRes = c; + assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); + *pRes = c; rc = SQLITE_OK; goto moveto_finish; } - pCur->idx = lwr; + pCur->aiIdx[pCur->iPage] = (u16)lwr; pCur->info.nSize = 0; pCur->validNKey = 0; rc = moveToChild(pCur, chldPg); if( rc ) goto moveto_finish; } moveto_finish: - if( pKey ){ - /* If we created our own unpacked key at the top of this - ** procedure, then destroy that key before returning. */ - sqlite3VdbeDeleteUnpackedRecord(pUnKey); - } return rc; } @@ -3864,14 +4424,6 @@ int sqlite3BtreeEof(BtCursor *pCur){ } /* -** Return the database connection handle for a cursor. -*/ -sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){ - assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); - return pCur->pBtree->db; -} - -/* ** Advance the cursor to the next entry in the database. If ** successful then set *pRes=0. If the cursor ** was already pointing to the last entry in the database before @@ -3879,6 +4431,7 @@ sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){ */ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ int rc; + int idx; MemPage *pPage; assert( cursorHoldsMutex(pCur) ); @@ -3887,25 +4440,25 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ return rc; } assert( pRes!=0 ); - pPage = pCur->pPage; if( CURSOR_INVALID==pCur->eState ){ *pRes = 1; return SQLITE_OK; } - if( pCur->skip>0 ){ - pCur->skip = 0; + if( pCur->skipNext>0 ){ + pCur->skipNext = 0; *pRes = 0; return SQLITE_OK; } - pCur->skip = 0; + pCur->skipNext = 0; + pPage = pCur->apPage[pCur->iPage]; + idx = ++pCur->aiIdx[pCur->iPage]; assert( pPage->isInit ); - assert( pCur->idx<pPage->nCell ); + assert( idx<=pPage->nCell ); - pCur->idx++; pCur->info.nSize = 0; pCur->validNKey = 0; - if( pCur->idx>=pPage->nCell ){ + if( idx>=pPage->nCell ){ if( !pPage->leaf ){ rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); if( rc ) return rc; @@ -3914,14 +4467,14 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ return rc; } do{ - if( sqlite3BtreeIsRootPage(pPage) ){ + if( pCur->iPage==0 ){ *pRes = 1; pCur->eState = CURSOR_INVALID; return SQLITE_OK; } - sqlite3BtreeMoveToParent(pCur); - pPage = pCur->pPage; - }while( pCur->idx>=pPage->nCell ); + moveToParent(pCur); + pPage = pCur->apPage[pCur->iPage]; + }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell ); *pRes = 0; if( pPage->intKey ){ rc = sqlite3BtreeNext(pCur, pRes); @@ -3947,7 +4500,6 @@ int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ */ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ int rc; - Pgno pgno; MemPage *pPage; assert( cursorHoldsMutex(pCur) ); @@ -3960,36 +4512,36 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ *pRes = 1; return SQLITE_OK; } - if( pCur->skip<0 ){ - pCur->skip = 0; + if( pCur->skipNext<0 ){ + pCur->skipNext = 0; *pRes = 0; return SQLITE_OK; } - pCur->skip = 0; + pCur->skipNext = 0; - pPage = pCur->pPage; + pPage = pCur->apPage[pCur->iPage]; assert( pPage->isInit ); - assert( pCur->idx>=0 ); if( !pPage->leaf ){ - pgno = get4byte( findCell(pPage, pCur->idx) ); - rc = moveToChild(pCur, pgno); + int idx = pCur->aiIdx[pCur->iPage]; + rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); if( rc ){ return rc; } rc = moveToRightmost(pCur); }else{ - while( pCur->idx==0 ){ - if( sqlite3BtreeIsRootPage(pPage) ){ + while( pCur->aiIdx[pCur->iPage]==0 ){ + if( pCur->iPage==0 ){ pCur->eState = CURSOR_INVALID; *pRes = 1; return SQLITE_OK; } - sqlite3BtreeMoveToParent(pCur); - pPage = pCur->pPage; + moveToParent(pCur); } - pCur->idx--; pCur->info.nSize = 0; pCur->validNKey = 0; + + pCur->aiIdx[pCur->iPage]--; + pPage = pCur->apPage[pCur->iPage]; if( pPage->intKey && !pPage->leaf ){ rc = sqlite3BtreePrevious(pCur, pRes); }else{ @@ -4030,14 +4582,20 @@ static int allocateBtreePage( ){ MemPage *pPage1; int rc; - int n; /* Number of pages on the freelist */ - int k; /* Number of leaves on the trunk of the freelist */ + u32 n; /* Number of pages on the freelist */ + u32 k; /* Number of leaves on the trunk of the freelist */ MemPage *pTrunk = 0; MemPage *pPrevTrunk = 0; + Pgno mxPage; /* Total size of the database file */ assert( sqlite3_mutex_held(pBt->mutex) ); pPage1 = pBt->pPage1; + mxPage = pagerPagecount(pBt); n = get4byte(&pPage1->aData[36]); + testcase( n==mxPage-1 ); + if( n>=mxPage ){ + return SQLITE_CORRUPT_BKPT; + } if( n>0 ){ /* There are pages on the freelist. Reuse one of those pages. */ Pgno iTrunk; @@ -4048,7 +4606,7 @@ static int allocateBtreePage( ** the entire-list will be searched for that page. */ #ifndef SQLITE_OMIT_AUTOVACUUM - if( exact && nearby<=pagerPagecount(pBt->pPager) ){ + if( exact && nearby<=mxPage ){ u8 eType; assert( nearby>0 ); assert( pBt->autoVacuum ); @@ -4079,7 +4637,12 @@ static int allocateBtreePage( }else{ iTrunk = get4byte(&pPage1->aData[32]); } - rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0); + testcase( iTrunk==mxPage ); + if( iTrunk>mxPage ){ + rc = SQLITE_CORRUPT_BKPT; + }else{ + rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); + } if( rc ){ pTrunk = 0; goto end_allocate_page; @@ -4100,7 +4663,7 @@ static int allocateBtreePage( *ppPage = pTrunk; pTrunk = 0; TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); - }else if( k>pBt->usableSize/4 - 2 ){ + }else if( k>(u32)(pBt->usableSize/4 - 2) ){ /* Value of k is out of range. Database corruption */ rc = SQLITE_CORRUPT_BKPT; goto end_allocate_page; @@ -4129,7 +4692,12 @@ static int allocateBtreePage( */ MemPage *pNewTrunk; Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); - rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); + if( iNewTrunk>mxPage ){ + rc = SQLITE_CORRUPT_BKPT; + goto end_allocate_page; + } + testcase( iNewTrunk==mxPage ); + rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); if( rc!=SQLITE_OK ){ goto end_allocate_page; } @@ -4143,6 +4711,7 @@ static int allocateBtreePage( memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); releasePage(pNewTrunk); if( !pPrevTrunk ){ + assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); put4byte(&pPage1->aData[32], iNewTrunk); }else{ rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); @@ -4155,9 +4724,9 @@ static int allocateBtreePage( pTrunk = 0; TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); #endif - }else{ + }else if( k>0 ){ /* Extract a leaf from the trunk */ - int closest; + u32 closest; Pgno iPage; unsigned char *aData = pTrunk->aData; rc = sqlite3PagerWrite(pTrunk->pDbPage); @@ -4165,7 +4734,8 @@ static int allocateBtreePage( goto end_allocate_page; } if( nearby>0 ){ - int i, dist; + u32 i; + int dist; closest = 0; dist = get4byte(&aData[8]) - nearby; if( dist<0 ) dist = -dist; @@ -4182,15 +4752,15 @@ static int allocateBtreePage( } iPage = get4byte(&aData[8+closest*4]); + testcase( iPage==mxPage ); + if( iPage>mxPage ){ + rc = SQLITE_CORRUPT_BKPT; + goto end_allocate_page; + } + testcase( iPage==mxPage ); if( !searchList || iPage==nearby ){ - int nPage; + int noContent; *pPgno = iPage; - nPage = pagerPagecount(pBt->pPager); - if( *pPgno>nPage ){ - /* Free page off the end of the file */ - rc = SQLITE_CORRUPT_BKPT; - goto end_allocate_page; - } TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" ": %d more free pages\n", *pPgno, closest+1, k, pTrunk->pgno, n-1)); @@ -4198,9 +4768,10 @@ static int allocateBtreePage( memcpy(&aData[8+closest*4], &aData[4+k*4], 4); } put4byte(&aData[4], k-1); - rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1); + assert( sqlite3PagerIswriteable(pTrunk->pDbPage) ); + noContent = !btreeGetHasContent(pBt, *pPgno); + rc = btreeGetPage(pBt, *pPgno, ppPage, noContent); if( rc==SQLITE_OK ){ - sqlite3PagerDontRollback((*ppPage)->pDbPage); rc = sqlite3PagerWrite((*ppPage)->pDbPage); if( rc!=SQLITE_OK ){ releasePage(*ppPage); @@ -4215,37 +4786,35 @@ static int allocateBtreePage( }else{ /* There are no pages on the freelist, so create a new page at the ** end of the file */ - int nPage = pagerPagecount(pBt->pPager); + int nPage = pagerPagecount(pBt); *pPgno = nPage + 1; -#ifndef SQLITE_OMIT_AUTOVACUUM - if( pBt->nTrunc ){ - /* An incr-vacuum has already run within this transaction. So the - ** page to allocate is not from the physical end of the file, but - ** at pBt->nTrunc. - */ - *pPgno = pBt->nTrunc+1; - if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ - (*pPgno)++; - } + if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ + (*pPgno)++; } + +#ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){ /* If *pPgno refers to a pointer-map page, allocate two new pages ** at the end of the file instead of one. The first allocated page ** becomes a new pointer-map page, the second is used by the caller. */ + MemPage *pPg = 0; TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno)); assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); + rc = btreeGetPage(pBt, *pPgno, &pPg, 0); + if( rc==SQLITE_OK ){ + rc = sqlite3PagerWrite(pPg->pDbPage); + releasePage(pPg); + } + if( rc ) return rc; (*pPgno)++; if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; } } - if( pBt->nTrunc ){ - pBt->nTrunc = *pPgno; - } #endif assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); - rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0); + rc = btreeGetPage(pBt, *pPgno, ppPage, 0); if( rc ) return rc; rc = sqlite3PagerWrite((*ppPage)->pDbPage); if( rc!=SQLITE_OK ){ @@ -4259,38 +4828,64 @@ static int allocateBtreePage( end_allocate_page: releasePage(pTrunk); releasePage(pPrevTrunk); + if( rc==SQLITE_OK ){ + if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ + releasePage(*ppPage); + return SQLITE_CORRUPT_BKPT; + } + (*ppPage)->isInit = 0; + }else{ + *ppPage = 0; + } return rc; } /* -** Add a page of the database file to the freelist. +** This function is used to add page iPage to the database file free-list. +** It is assumed that the page is not already a part of the free-list. +** +** The value passed as the second argument to this function is optional. +** If the caller happens to have a pointer to the MemPage object +** corresponding to page iPage handy, it may pass it as the second value. +** Otherwise, it may pass NULL. ** -** sqlite3PagerUnref() is NOT called for pPage. +** If a pointer to a MemPage object is passed as the second argument, +** its reference count is not altered by this function. */ -static int freePage(MemPage *pPage){ - BtShared *pBt = pPage->pBt; - MemPage *pPage1 = pBt->pPage1; - int rc, n, k; +static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ + MemPage *pTrunk = 0; /* Free-list trunk page */ + Pgno iTrunk = 0; /* Page number of free-list trunk page */ + MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ + MemPage *pPage; /* Page being freed. May be NULL. */ + int rc; /* Return Code */ + int nFree; /* Initial number of pages on free-list */ - /* Prepare the page for freeing */ - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - assert( pPage->pgno>1 ); - pPage->isInit = 0; - releasePage(pPage->pParent); - pPage->pParent = 0; + assert( sqlite3_mutex_held(pBt->mutex) ); + assert( iPage>1 ); + assert( !pMemPage || pMemPage->pgno==iPage ); + + if( pMemPage ){ + pPage = pMemPage; + sqlite3PagerRef(pPage->pDbPage); + }else{ + pPage = btreePageLookup(pBt, iPage); + } /* Increment the free page count on pPage1 */ rc = sqlite3PagerWrite(pPage1->pDbPage); - if( rc ) return rc; - n = get4byte(&pPage1->aData[36]); - put4byte(&pPage1->aData[36], n+1); + if( rc ) goto freepage_out; + nFree = get4byte(&pPage1->aData[36]); + put4byte(&pPage1->aData[36], nFree+1); #ifdef SQLITE_SECURE_DELETE /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then ** always fully overwrite deleted information with zeros. */ - rc = sqlite3PagerWrite(pPage->pDbPage); - if( rc ) return rc; + if( (!pPage && (rc = btreeGetPage(pBt, iPage, &pPage, 0))) + || (rc = sqlite3PagerWrite(pPage->pDbPage)) + ){ + goto freepage_out; + } memset(pPage->aData, 0, pPage->pBt->pageSize); #endif @@ -4298,27 +4893,35 @@ static int freePage(MemPage *pPage){ ** to indicate that the page is free. */ if( ISAUTOVACUUM ){ - rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0); - if( rc ) return rc; + ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); + if( rc ) goto freepage_out; } - if( n==0 ){ - /* This is the first free page */ - rc = sqlite3PagerWrite(pPage->pDbPage); - if( rc ) return rc; - memset(pPage->aData, 0, 8); - put4byte(&pPage1->aData[32], pPage->pgno); - TRACE(("FREE-PAGE: %d first\n", pPage->pgno)); - }else{ - /* Other free pages already exist. Retrive the first trunk page - ** of the freelist and find out how many leaves it has. */ - MemPage *pTrunk; - rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0); - if( rc ) return rc; - k = get4byte(&pTrunk->aData[4]); - if( k>=pBt->usableSize/4 - 8 ){ - /* The trunk is full. Turn the page being freed into a new - ** trunk page with no leaves. + /* Now manipulate the actual database free-list structure. There are two + ** possibilities. If the free-list is currently empty, or if the first + ** trunk page in the free-list is full, then this page will become a + ** new free-list trunk page. Otherwise, it will become a leaf of the + ** first trunk page in the current free-list. This block tests if it + ** is possible to add the page as a new free-list leaf. + */ + if( nFree!=0 ){ + u32 nLeaf; /* Initial number of leaf cells on trunk page */ + + iTrunk = get4byte(&pPage1->aData[32]); + rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); + if( rc!=SQLITE_OK ){ + goto freepage_out; + } + + nLeaf = get4byte(&pTrunk->aData[4]); + assert( pBt->usableSize>32 ); + if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ + rc = SQLITE_CORRUPT_BKPT; + goto freepage_out; + } + if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ + /* In this case there is room on the trunk page to insert the page + ** being freed as a new leaf. ** ** Note that the trunk page is not really full until it contains ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have @@ -4326,37 +4929,58 @@ static int freePage(MemPage *pPage){ ** 3.6.0, databases with freelist trunk pages holding more than ** usableSize/4 - 8 entries will be reported as corrupt. In order ** to maintain backwards compatibility with older versions of SQLite, - ** we will contain to restrict the number of entries to usableSize/4 - 8 + ** we will continue to restrict the number of entries to usableSize/4 - 8 ** for now. At some point in the future (once everyone has upgraded ** to 3.6.0 or later) we should consider fixing the conditional above ** to read "usableSize/4-2" instead of "usableSize/4-8". */ - rc = sqlite3PagerWrite(pPage->pDbPage); - if( rc==SQLITE_OK ){ - put4byte(pPage->aData, pTrunk->pgno); - put4byte(&pPage->aData[4], 0); - put4byte(&pPage1->aData[32], pPage->pgno); - TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", - pPage->pgno, pTrunk->pgno)); - } - }else if( k<0 ){ - rc = SQLITE_CORRUPT; - }else{ - /* Add the newly freed page as a leaf on the current trunk */ rc = sqlite3PagerWrite(pTrunk->pDbPage); if( rc==SQLITE_OK ){ - put4byte(&pTrunk->aData[4], k+1); - put4byte(&pTrunk->aData[8+k*4], pPage->pgno); + put4byte(&pTrunk->aData[4], nLeaf+1); + put4byte(&pTrunk->aData[8+nLeaf*4], iPage); #ifndef SQLITE_SECURE_DELETE - sqlite3PagerDontWrite(pPage->pDbPage); + if( pPage ){ + sqlite3PagerDontWrite(pPage->pDbPage); + } #endif + rc = btreeSetHasContent(pBt, iPage); } TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); + goto freepage_out; } - releasePage(pTrunk); } + + /* If control flows to this point, then it was not possible to add the + ** the page being freed as a leaf page of the first trunk in the free-list. + ** Possibly because the free-list is empty, or possibly because the + ** first trunk in the free-list is full. Either way, the page being freed + ** will become the new first trunk page in the free-list. + */ + if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ + goto freepage_out; + } + rc = sqlite3PagerWrite(pPage->pDbPage); + if( rc!=SQLITE_OK ){ + goto freepage_out; + } + put4byte(pPage->aData, iTrunk); + put4byte(&pPage->aData[4], 0); + put4byte(&pPage1->aData[32], iPage); + TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); + +freepage_out: + if( pPage ){ + pPage->isInit = 0; + } + releasePage(pPage); + releasePage(pTrunk); return rc; } +static void freePage(MemPage *pPage, int *pRC){ + if( (*pRC)==SQLITE_OK ){ + *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); + } +} /* ** Free any overflow pages associated with the given Cell. @@ -4367,28 +4991,37 @@ static int clearCell(MemPage *pPage, unsigned char *pCell){ Pgno ovflPgno; int rc; int nOvfl; - int ovflPageSize; + u16 ovflPageSize; assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - sqlite3BtreeParseCellPtr(pPage, pCell, &info); + btreeParseCellPtr(pPage, pCell, &info); if( info.iOverflow==0 ){ return SQLITE_OK; /* No overflow pages. Return without doing anything */ } ovflPgno = get4byte(&pCell[info.iOverflow]); + assert( pBt->usableSize > 4 ); ovflPageSize = pBt->usableSize - 4; nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; assert( ovflPgno==0 || nOvfl>0 ); while( nOvfl-- ){ - MemPage *pOvfl; - if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){ + Pgno iNext = 0; + MemPage *pOvfl = 0; + if( ovflPgno<2 || ovflPgno>pagerPagecount(pBt) ){ + /* 0 is not a legal page number and page 1 cannot be an + ** overflow page. Therefore if ovflPgno<2 or past the end of the + ** file the database must be corrupt. */ return SQLITE_CORRUPT_BKPT; } - - rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno); - if( rc ) return rc; - rc = freePage(pOvfl); - sqlite3PagerUnref(pOvfl->pDbPage); + if( nOvfl ){ + rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); + if( rc ) return rc; + } + rc = freePage2(pBt, pOvfl, ovflPgno); + if( pOvfl ){ + sqlite3PagerUnref(pOvfl->pDbPage); + } if( rc ) return rc; + ovflPgno = iNext; } return SQLITE_OK; } @@ -4428,6 +5061,11 @@ static int fillInCell( assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + /* pPage is not necessarily writeable since pCell might be auxiliary + ** buffer space that is separate from the pPage buffer area */ + assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize] + || sqlite3PagerIswriteable(pPage->pDbPage) ); + /* Fill in the header. */ nHeader = 0; if( !pPage->leaf ){ @@ -4439,10 +5077,10 @@ static int fillInCell( nData = nZero = 0; } nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); - sqlite3BtreeParseCellPtr(pPage, pCell, &info); + btreeParseCellPtr(pPage, pCell, &info); assert( info.nHeader==nHeader ); assert( info.nKey==nKey ); - assert( info.nData==nData+nZero ); + assert( info.nData==(u32)(nData+nZero) ); /* Fill in the payload */ nPayload = nData + nZero; @@ -4450,10 +5088,13 @@ static int fillInCell( pSrc = pData; nSrc = nData; nData = 0; - }else{ - nPayload += nKey; + }else{ + if( NEVER(nKey>0x7fffffff || pKey==0) ){ + return SQLITE_CORRUPT_BKPT; + } + nPayload += (int)nKey; pSrc = pKey; - nSrc = nKey; + nSrc = (int)nKey; } *pnSize = info.nSize; spaceLeft = info.nLocal; @@ -4462,7 +5103,6 @@ static int fillInCell( while( nPayload>0 ){ if( spaceLeft==0 ){ - int isExact = 0; #ifndef SQLITE_OMIT_AUTOVACUUM Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ if( pBt->autoVacuum ){ @@ -4471,12 +5111,9 @@ static int fillInCell( } while( PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) ); - if( pgnoOvfl>1 ){ - /* isExact = 1; */ - } } #endif - rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact); + rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); #ifndef SQLITE_OMIT_AUTOVACUUM /* If the database supports auto-vacuum, and the second or subsequent ** overflow page is being allocated, add an entry to the pointer-map @@ -4490,7 +5127,7 @@ static int fillInCell( */ if( pBt->autoVacuum && rc==SQLITE_OK ){ u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); - rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap); + ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc); if( rc ){ releasePage(pOvfl); } @@ -4500,6 +5137,16 @@ static int fillInCell( releasePage(pToRelease); return rc; } + + /* If pToRelease is not zero than pPrior points into the data area + ** of pToRelease. Make sure pToRelease is still writeable. */ + assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); + + /* If pPrior is part of the data area of pPage, then make sure pPage + ** is still writeable */ + assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] + || sqlite3PagerIswriteable(pPage->pDbPage) ); + put4byte(pPrior, pgnoOvfl); releasePage(pToRelease); pToRelease = pOvfl; @@ -4510,6 +5157,16 @@ static int fillInCell( } n = nPayload; if( n>spaceLeft ) n = spaceLeft; + + /* If pToRelease is not zero than pPayload points into the data area + ** of pToRelease. Make sure pToRelease is still writeable. */ + assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); + + /* If pPayload is part of the data area of pPage, then make sure pPage + ** is still writeable */ + assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] + || sqlite3PagerIswriteable(pPage->pDbPage) ); + if( nSrc>0 ){ if( n>nSrc ) n = nSrc; assert( pSrc ); @@ -4531,102 +5188,6 @@ static int fillInCell( return SQLITE_OK; } - -/* -** Change the MemPage.pParent pointer on the page whose number is -** given in the second argument so that MemPage.pParent holds the -** pointer in the third argument. -** -** If the final argument, updatePtrmap, is non-zero and the database -** is an auto-vacuum database, then the pointer-map entry for pgno -** is updated. -*/ -static int reparentPage( - BtShared *pBt, /* B-Tree structure */ - Pgno pgno, /* Page number of child being adopted */ - MemPage *pNewParent, /* New parent of pgno */ - int idx, /* Index of child page pgno in pNewParent */ - int updatePtrmap /* If true, update pointer-map for pgno */ -){ - MemPage *pThis; - DbPage *pDbPage; - - assert( sqlite3_mutex_held(pBt->mutex) ); - assert( pNewParent!=0 ); - if( pgno==0 ) return SQLITE_OK; - assert( pBt->pPager!=0 ); - pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); - if( pDbPage ){ - pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage); - if( pThis->isInit ){ - assert( pThis->aData==sqlite3PagerGetData(pDbPage) ); - if( pThis->pParent!=pNewParent ){ - if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage); - pThis->pParent = pNewParent; - sqlite3PagerRef(pNewParent->pDbPage); - } - pThis->idxParent = idx; - } - sqlite3PagerUnref(pDbPage); - } - - if( ISAUTOVACUUM && updatePtrmap ){ - return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno); - } - -#ifndef NDEBUG - /* If the updatePtrmap flag was clear, assert that the entry in the - ** pointer-map is already correct. - */ - if( ISAUTOVACUUM ){ - pDbPage = sqlite3PagerLookup(pBt->pPager,PTRMAP_PAGENO(pBt,pgno)); - if( pDbPage ){ - u8 eType; - Pgno ii; - int rc = ptrmapGet(pBt, pgno, &eType, &ii); - assert( rc==SQLITE_OK && ii==pNewParent->pgno && eType==PTRMAP_BTREE ); - sqlite3PagerUnref(pDbPage); - } - } -#endif - - return SQLITE_OK; -} - - - -/* -** Change the pParent pointer of all children of pPage to point back -** to pPage. -** -** In other words, for every child of pPage, invoke reparentPage() -** to make sure that each child knows that pPage is its parent. -** -** This routine gets called after you memcpy() one page into -** another. -** -** If updatePtrmap is true, then the pointer-map entries for all child -** pages of pPage are updated. -*/ -static int reparentChildPages(MemPage *pPage, int updatePtrmap){ - int rc = SQLITE_OK; - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - if( !pPage->leaf ){ - int i; - BtShared *pBt = pPage->pBt; - Pgno iRight = get4byte(&pPage->aData[pPage->hdrOffset+8]); - - for(i=0; i<pPage->nCell; i++){ - u8 *pCell = findCell(pPage, i); - rc = reparentPage(pBt, get4byte(pCell), pPage, i, updatePtrmap); - if( rc!=SQLITE_OK ) return rc; - } - rc = reparentPage(pBt, iRight, pPage, i, updatePtrmap); - pPage->idxShift = 0; - } - return rc; -} - /* ** Remove the i-th cell from pPage. This routine effects pPage only. ** The cell content is not freed or deallocated. It is assumed that @@ -4635,11 +5196,15 @@ static int reparentChildPages(MemPage *pPage, int updatePtrmap){ ** ** "sz" must be the number of bytes in the cell. */ -static void dropCell(MemPage *pPage, int idx, int sz){ +static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ int i; /* Loop counter */ int pc; /* Offset to cell content of cell being deleted */ u8 *data; /* pPage->aData */ u8 *ptr; /* Used to move bytes around within data[] */ + int rc; /* The return code */ + int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ + + if( *pRC ) return; assert( idx>=0 && idx<pPage->nCell ); assert( sz==cellSize(pPage, idx) ); @@ -4648,16 +5213,25 @@ static void dropCell(MemPage *pPage, int idx, int sz){ data = pPage->aData; ptr = &data[pPage->cellOffset + 2*idx]; pc = get2byte(ptr); - assert( pc>10 && pc+sz<=pPage->pBt->usableSize ); - freeSpace(pPage, pc, sz); + hdr = pPage->hdrOffset; + testcase( pc==get2byte(&data[hdr+5]) ); + testcase( pc+sz==pPage->pBt->usableSize ); + if( pc < get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){ + *pRC = SQLITE_CORRUPT_BKPT; + return; + } + rc = freeSpace(pPage, pc, sz); + if( rc ){ + *pRC = rc; + return; + } for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ ptr[0] = ptr[2]; ptr[1] = ptr[3]; } pPage->nCell--; - put2byte(&data[pPage->hdrOffset+3], pPage->nCell); + put2byte(&data[hdr+3], pPage->nCell); pPage->nFree += 2; - pPage->idxShift = 1; } /* @@ -4677,25 +5251,30 @@ static void dropCell(MemPage *pPage, int idx, int sz){ ** nSkip is non-zero, then pCell may not point to an invalid memory location ** (but pCell+nSkip is always valid). */ -static int insertCell( +static void insertCell( MemPage *pPage, /* Page into which we are copying */ int i, /* New cell becomes the i-th cell of the page */ u8 *pCell, /* Content of the new cell */ int sz, /* Bytes of content in pCell */ u8 *pTemp, /* Temp storage space for pCell, if needed */ - u8 nSkip /* Do not write the first nSkip bytes of the cell */ + Pgno iChild, /* If non-zero, replace first 4 bytes with this value */ + int *pRC /* Read and write return code from here */ ){ int idx; /* Where to write new cell content in data[] */ int j; /* Loop counter */ - int top; /* First byte of content for any cell in data[] */ int end; /* First byte past the last cell pointer in data[] */ int ins; /* Index in data[] where new cell pointer is inserted */ - int hdr; /* Offset into data[] of the page header */ int cellOffset; /* Address of first cell pointer in data[] */ u8 *data; /* The content of the whole page */ u8 *ptr; /* Used for moving information around in data[] */ + int nSkip = (iChild ? 4 : 0); + + if( *pRC ) return; + assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); + assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 ); + assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) ); assert( sz==cellSizePtr(pPage, pCell) ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); if( pPage->nOverflow || sz+2>pPage->nFree ){ @@ -4703,59 +5282,51 @@ static int insertCell( memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); pCell = pTemp; } + if( iChild ){ + put4byte(pCell, iChild); + } j = pPage->nOverflow++; - assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) ); + assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) ); pPage->aOvfl[j].pCell = pCell; - pPage->aOvfl[j].idx = i; - pPage->nFree = 0; + pPage->aOvfl[j].idx = (u16)i; }else{ int rc = sqlite3PagerWrite(pPage->pDbPage); if( rc!=SQLITE_OK ){ - return rc; + *pRC = rc; + return; } assert( sqlite3PagerIswriteable(pPage->pDbPage) ); data = pPage->aData; - hdr = pPage->hdrOffset; - top = get2byte(&data[hdr+5]); cellOffset = pPage->cellOffset; - end = cellOffset + 2*pPage->nCell + 2; + end = cellOffset + 2*pPage->nCell; ins = cellOffset + 2*i; - if( end > top - sz ){ - defragmentPage(pPage); - top = get2byte(&data[hdr+5]); - assert( end + sz <= top ); - } - idx = allocateSpace(pPage, sz); - assert( idx>0 ); - assert( end <= get2byte(&data[hdr+5]) ); + rc = allocateSpace(pPage, sz, &idx); + if( rc ){ *pRC = rc; return; } + /* The allocateSpace() routine guarantees the following two properties + ** if it returns success */ + assert( idx >= end+2 ); + assert( idx+sz <= pPage->pBt->usableSize ); pPage->nCell++; - pPage->nFree -= 2; + pPage->nFree -= (u16)(2 + sz); memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); - for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){ + if( iChild ){ + put4byte(&data[idx], iChild); + } + for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){ ptr[0] = ptr[-2]; ptr[1] = ptr[-1]; } put2byte(&data[ins], idx); - put2byte(&data[hdr+3], pPage->nCell); - pPage->idxShift = 1; + put2byte(&data[pPage->hdrOffset+3], pPage->nCell); #ifndef SQLITE_OMIT_AUTOVACUUM if( pPage->pBt->autoVacuum ){ /* The cell may contain a pointer to an overflow page. If so, write ** the entry for the overflow page into the pointer map. */ - CellInfo info; - sqlite3BtreeParseCellPtr(pPage, pCell, &info); - assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); - if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ - Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); - rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno); - if( rc!=SQLITE_OK ) return rc; - } + ptrmapPutOvflPtr(pPage, pCell, pRC); } #endif } - - return SQLITE_OK; } /* @@ -4769,38 +5340,33 @@ static void assemblePage( u16 *aSize /* Sizes of the cells */ ){ int i; /* Loop counter */ - int totalSize; /* Total size of all cells */ - int hdr; /* Index of page header */ - int cellptr; /* Address of next cell pointer */ + u8 *pCellptr; /* Address of next cell pointer */ int cellbody; /* Address of next cell body */ - u8 *data; /* Data for the page */ + u8 * const data = pPage->aData; /* Pointer to data for pPage */ + const int hdr = pPage->hdrOffset; /* Offset of header on pPage */ + const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ assert( pPage->nOverflow==0 ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - totalSize = 0; - for(i=0; i<nCell; i++){ - totalSize += aSize[i]; - } - assert( totalSize+2*nCell<=pPage->nFree ); + assert( nCell>=0 && nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 ); + assert( sqlite3PagerIswriteable(pPage->pDbPage) ); + + /* Check that the page has just been zeroed by zeroPage() */ assert( pPage->nCell==0 ); - cellptr = pPage->cellOffset; - data = pPage->aData; - hdr = pPage->hdrOffset; - put2byte(&data[hdr+3], nCell); - if( nCell ){ - cellbody = allocateSpace(pPage, totalSize); - assert( cellbody>0 ); - assert( pPage->nFree >= 2*nCell ); - pPage->nFree -= 2*nCell; - for(i=0; i<nCell; i++){ - put2byte(&data[cellptr], cellbody); - memcpy(&data[cellbody], apCell[i], aSize[i]); - cellptr += 2; - cellbody += aSize[i]; - } - assert( cellbody==pPage->pBt->usableSize ); + assert( get2byte(&data[hdr+5])==nUsable ); + + pCellptr = &data[pPage->cellOffset + nCell*2]; + cellbody = nUsable; + for(i=nCell-1; i>=0; i--){ + pCellptr -= 2; + cellbody -= aSize[i]; + put2byte(pCellptr, cellbody); + memcpy(&data[cellbody], apCell[i], aSize[i]); } - pPage->nCell = nCell; + put2byte(&data[hdr+3], nCell); + put2byte(&data[hdr+5], cellbody); + pPage->nFree -= (nCell*2 + nUsable - cellbody); + pPage->nCell = (u16)nCell; } /* @@ -4818,8 +5384,6 @@ static void assemblePage( #define NN 1 /* Number of neighbors on either side of pPage */ #define NB (NN*2+1) /* Total pages involved in the balance */ -/* Forward reference */ -static int balance(MemPage*, int); #ifndef SQLITE_OMIT_QUICKBALANCE /* @@ -4828,7 +5392,7 @@ static int balance(MemPage*, int); ** tree, in other words, when the new entry will become the largest ** entry in the tree. ** -** Instead of trying balance the 3 right-most leaf pages, just add +** Instead of trying to balance the 3 right-most leaf pages, just add ** a new page to the right-hand side and put the one new entry in ** that page. This leaves the right side of the tree somewhat ** unbalanced. But odds are that we will be inserting new entries @@ -4838,254 +5402,351 @@ static int balance(MemPage*, int); ** pPage is the leaf page which is the right-most page in the tree. ** pParent is its parent. pPage must have a single overflow entry ** which is also the right-most entry on the page. +** +** The pSpace buffer is used to store a temporary copy of the divider +** cell that will be inserted into pParent. Such a cell consists of a 4 +** byte page number followed by a variable length integer. In other +** words, at most 13 bytes. Hence the pSpace buffer must be at +** least 13 bytes in size. */ -static int balance_quick(MemPage *pPage, MemPage *pParent){ - int rc; - MemPage *pNew; - Pgno pgnoNew; - u8 *pCell; - u16 szCell; - CellInfo info; - BtShared *pBt = pPage->pBt; - int parentIdx = pParent->nCell; /* pParent new divider cell index */ - int parentSize; /* Size of new divider cell */ - u8 parentCell[64]; /* Space for the new divider cell */ +static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ + BtShared *const pBt = pPage->pBt; /* B-Tree Database */ + MemPage *pNew; /* Newly allocated page */ + int rc; /* Return Code */ + Pgno pgnoNew; /* Page number of pNew */ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + assert( sqlite3PagerIswriteable(pParent->pDbPage) ); + assert( pPage->nOverflow==1 ); - /* Allocate a new page. Insert the overflow cell from pPage - ** into it. Then remove the overflow cell from pPage. + if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT; + + /* Allocate a new page. This page will become the right-sibling of + ** pPage. Make the parent page writable, so that the new divider cell + ** may be inserted. If both these operations are successful, proceed. */ rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); - if( rc!=SQLITE_OK ){ - return rc; + + if( rc==SQLITE_OK ){ + + u8 *pOut = &pSpace[4]; + u8 *pCell = pPage->aOvfl[0].pCell; + u16 szCell = cellSizePtr(pPage, pCell); + u8 *pStop; + + assert( sqlite3PagerIswriteable(pNew->pDbPage) ); + assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); + zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); + assemblePage(pNew, 1, &pCell, &szCell); + + /* If this is an auto-vacuum database, update the pointer map + ** with entries for the new page, and any pointer from the + ** cell on the page to an overflow page. If either of these + ** operations fails, the return code is set, but the contents + ** of the parent page are still manipulated by thh code below. + ** That is Ok, at this point the parent page is guaranteed to + ** be marked as dirty. Returning an error code will cause a + ** rollback, undoing any changes made to the parent page. + */ + if( ISAUTOVACUUM ){ + ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); + if( szCell>pNew->minLocal ){ + ptrmapPutOvflPtr(pNew, pCell, &rc); + } + } + + /* Create a divider cell to insert into pParent. The divider cell + ** consists of a 4-byte page number (the page number of pPage) and + ** a variable length key value (which must be the same value as the + ** largest key on pPage). + ** + ** To find the largest key value on pPage, first find the right-most + ** cell on pPage. The first two fields of this cell are the + ** record-length (a variable length integer at most 32-bits in size) + ** and the key value (a variable length integer, may have any value). + ** The first of the while(...) loops below skips over the record-length + ** field. The second while(...) loop copies the key value from the + ** cell on pPage into the pSpace buffer. + */ + pCell = findCell(pPage, pPage->nCell-1); + pStop = &pCell[9]; + while( (*(pCell++)&0x80) && pCell<pStop ); + pStop = &pCell[9]; + while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); + + /* Insert the new divider cell into pParent. */ + insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), + 0, pPage->pgno, &rc); + + /* Set the right-child pointer of pParent to point to the new page. */ + put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); + + /* Release the reference to the new page. */ + releasePage(pNew); } - pCell = pPage->aOvfl[0].pCell; - szCell = cellSizePtr(pPage, pCell); - zeroPage(pNew, pPage->aData[0]); - assemblePage(pNew, 1, &pCell, &szCell); - pPage->nOverflow = 0; - /* Set the parent of the newly allocated page to pParent. */ - pNew->pParent = pParent; - sqlite3PagerRef(pParent->pDbPage); + return rc; +} +#endif /* SQLITE_OMIT_QUICKBALANCE */ - /* pPage is currently the right-child of pParent. Change this - ** so that the right-child is the new page allocated above and - ** pPage is the next-to-right child. - ** - ** Ignore the return value of the call to fillInCell(). fillInCell() - ** may only return other than SQLITE_OK if it is required to allocate - ** one or more overflow pages. Since an internal table B-Tree cell - ** may never spill over onto an overflow page (it is a maximum of - ** 13 bytes in size), it is not neccessary to check the return code. - ** - ** Similarly, the insertCell() function cannot fail if the page - ** being inserted into is already writable and the cell does not - ** contain an overflow pointer. So ignore this return code too. - */ - assert( pPage->nCell>0 ); - pCell = findCell(pPage, pPage->nCell-1); - sqlite3BtreeParseCellPtr(pPage, pCell, &info); - fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize); - assert( parentSize<64 ); - assert( sqlite3PagerIswriteable(pParent->pDbPage) ); - insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4); - put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno); - put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); +#if 0 +/* +** This function does not contribute anything to the operation of SQLite. +** it is sometimes activated temporarily while debugging code responsible +** for setting pointer-map entries. +*/ +static int ptrmapCheckPages(MemPage **apPage, int nPage){ + int i, j; + for(i=0; i<nPage; i++){ + Pgno n; + u8 e; + MemPage *pPage = apPage[i]; + BtShared *pBt = pPage->pBt; + assert( pPage->isInit ); - /* If this is an auto-vacuum database, update the pointer map - ** with entries for the new page, and any pointer from the - ** cell on the page to an overflow page. - */ - if( ISAUTOVACUUM ){ - rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno); - if( rc==SQLITE_OK ){ - rc = ptrmapPutOvfl(pNew, 0); + for(j=0; j<pPage->nCell; j++){ + CellInfo info; + u8 *z; + + z = findCell(pPage, j); + btreeParseCellPtr(pPage, z, &info); + if( info.iOverflow ){ + Pgno ovfl = get4byte(&z[info.iOverflow]); + ptrmapGet(pBt, ovfl, &e, &n); + assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); + } + if( !pPage->leaf ){ + Pgno child = get4byte(z); + ptrmapGet(pBt, child, &e, &n); + assert( n==pPage->pgno && e==PTRMAP_BTREE ); + } } - if( rc!=SQLITE_OK ){ - releasePage(pNew); - return rc; + if( !pPage->leaf ){ + Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); + ptrmapGet(pBt, child, &e, &n); + assert( n==pPage->pgno && e==PTRMAP_BTREE ); } } + return 1; +} +#endif - /* Release the reference to the new page and balance the parent page, - ** in case the divider cell inserted caused it to become overfull. - */ - releasePage(pNew); - return balance(pParent, 0); +/* +** This function is used to copy the contents of the b-tree node stored +** on page pFrom to page pTo. If page pFrom was not a leaf page, then +** the pointer-map entries for each child page are updated so that the +** parent page stored in the pointer map is page pTo. If pFrom contained +** any cells with overflow page pointers, then the corresponding pointer +** map entries are also updated so that the parent page is page pTo. +** +** If pFrom is currently carrying any overflow cells (entries in the +** MemPage.aOvfl[] array), they are not copied to pTo. +** +** Before returning, page pTo is reinitialized using btreeInitPage(). +** +** The performance of this function is not critical. It is only used by +** the balance_shallower() and balance_deeper() procedures, neither of +** which are called often under normal circumstances. +*/ +static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ + if( (*pRC)==SQLITE_OK ){ + BtShared * const pBt = pFrom->pBt; + u8 * const aFrom = pFrom->aData; + u8 * const aTo = pTo->aData; + int const iFromHdr = pFrom->hdrOffset; + int const iToHdr = ((pTo->pgno==1) ? 100 : 0); + TESTONLY(int rc;) + int iData; + + + assert( pFrom->isInit ); + assert( pFrom->nFree>=iToHdr ); + assert( get2byte(&aFrom[iFromHdr+5])<=pBt->usableSize ); + + /* Copy the b-tree node content from page pFrom to page pTo. */ + iData = get2byte(&aFrom[iFromHdr+5]); + memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); + memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); + + /* Reinitialize page pTo so that the contents of the MemPage structure + ** match the new data. The initialization of pTo "cannot" fail, as the + ** data copied from pFrom is known to be valid. */ + pTo->isInit = 0; + TESTONLY(rc = ) btreeInitPage(pTo); + assert( rc==SQLITE_OK ); + + /* If this is an auto-vacuum database, update the pointer-map entries + ** for any b-tree or overflow pages that pTo now contains the pointers to. + */ + if( ISAUTOVACUUM ){ + *pRC = setChildPtrmaps(pTo); + } + } } -#endif /* SQLITE_OMIT_QUICKBALANCE */ /* -** This routine redistributes Cells on pPage and up to NN*2 siblings -** of pPage so that all pages have about the same amount of free space. -** Usually NN siblings on either side of pPage is used in the balancing, -** though more siblings might come from one side if pPage is the first -** or last child of its parent. If pPage has fewer than 2*NN siblings -** (something which can only happen if pPage is the root page or a -** child of root) then all available siblings participate in the balancing. +** This routine redistributes cells on the iParentIdx'th child of pParent +** (hereafter "the page") and up to 2 siblings so that all pages have about the +** same amount of free space. Usually a single sibling on either side of the +** page are used in the balancing, though both siblings might come from one +** side if the page is the first or last child of its parent. If the page +** has fewer than 2 siblings (something which can only happen if the page +** is a root page or a child of a root page) then all available siblings +** participate in the balancing. ** -** The number of siblings of pPage might be increased or decreased by one or -** two in an effort to keep pages nearly full but not over full. The root page -** is special and is allowed to be nearly empty. If pPage is -** the root page, then the depth of the tree might be increased -** or decreased by one, as necessary, to keep the root page from being -** overfull or completely empty. +** The number of siblings of the page might be increased or decreased by +** one or two in an effort to keep pages nearly full but not over full. ** -** Note that when this routine is called, some of the Cells on pPage -** might not actually be stored in pPage->aData[]. This can happen -** if the page is overfull. Part of the job of this routine is to -** make sure all Cells for pPage once again fit in pPage->aData[]. +** Note that when this routine is called, some of the cells on the page +** might not actually be stored in MemPage.aData[]. This can happen +** if the page is overfull. This routine ensures that all cells allocated +** to the page and its siblings fit into MemPage.aData[] before returning. ** -** In the course of balancing the siblings of pPage, the parent of pPage -** might become overfull or underfull. If that happens, then this routine -** is called recursively on the parent. +** In the course of balancing the page and its siblings, cells may be +** inserted into or removed from the parent page (pParent). Doing so +** may cause the parent page to become overfull or underfull. If this +** happens, it is the responsibility of the caller to invoke the correct +** balancing routine to fix this problem (see the balance() routine). ** ** If this routine fails for any reason, it might leave the database -** in a corrupted state. So if this routine fails, the database should +** in a corrupted state. So if this routine fails, the database should ** be rolled back. -*/ -static int balance_nonroot(MemPage *pPage){ - MemPage *pParent; /* The parent of pPage */ +** +** The third argument to this function, aOvflSpace, is a pointer to a +** buffer big enough to hold one page. If while inserting cells into the parent +** page (pParent) the parent page becomes overfull, this buffer is +** used to store the parent's overflow cells. Because this function inserts +** a maximum of four divider cells into the parent page, and the maximum +** size of a cell stored within an internal node is always less than 1/4 +** of the page-size, the aOvflSpace[] buffer is guaranteed to be large +** enough for all overflow cells. +** +** If aOvflSpace is set to a null pointer, this function returns +** SQLITE_NOMEM. +*/ +static int balance_nonroot( + MemPage *pParent, /* Parent page of siblings being balanced */ + int iParentIdx, /* Index of "the page" in pParent */ + u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ + int isRoot /* True if pParent is a root-page */ +){ BtShared *pBt; /* The whole database */ int nCell = 0; /* Number of cells in apCell[] */ int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ + int nNew = 0; /* Number of pages in apNew[] */ int nOld; /* Number of pages in apOld[] */ - int nNew; /* Number of pages in apNew[] */ - int nDiv; /* Number of cells in apDiv[] */ int i, j, k; /* Loop counters */ - int idx; /* Index of pPage in pParent->aCell[] */ int nxDiv; /* Next divider slot in pParent->aCell[] */ - int rc; /* The return code */ - int leafCorrection; /* 4 if pPage is a leaf. 0 if not */ + int rc = SQLITE_OK; /* The return code */ + u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ int usableSpace; /* Bytes in pPage beyond the header */ int pageFlags; /* Value of pPage->aData[0] */ int subtotal; /* Subtotal of bytes in cells on one page */ int iSpace1 = 0; /* First unused byte of aSpace1[] */ - int iSpace2 = 0; /* First unused byte of aSpace2[] */ + int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ int szScratch; /* Size of scratch memory requested */ MemPage *apOld[NB]; /* pPage and up to two siblings */ - Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */ MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ - Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */ - u8 *apDiv[NB]; /* Divider cells in pParent */ + u8 *pRight; /* Location in parent of right-sibling pointer */ + u8 *apDiv[NB-1]; /* Divider cells in pParent */ int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ int szNew[NB+2]; /* Combined size of cells place on i-th page */ u8 **apCell = 0; /* All cells begin balanced */ u16 *szCell; /* Local size of all cells in apCell[] */ - u8 *aCopy[NB]; /* Space for holding data of apCopy[] */ - u8 *aSpace1; /* Space for copies of dividers cells before balance */ - u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */ - u8 *aFrom = 0; + u8 *aSpace1; /* Space for copies of dividers cells */ + Pgno pgno; /* Temp var to store a page number in */ - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - - /* - ** Find the parent page. - */ - assert( pPage->isInit ); - assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 ); - pBt = pPage->pBt; - pParent = pPage->pParent; - assert( pParent ); - if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){ - return rc; - } + pBt = pParent->pBt; + assert( sqlite3_mutex_held(pBt->mutex) ); + assert( sqlite3PagerIswriteable(pParent->pDbPage) ); +#if 0 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); +#endif -#ifndef SQLITE_OMIT_QUICKBALANCE - /* - ** A special case: If a new entry has just been inserted into a - ** table (that is, a btree with integer keys and all data at the leaves) - ** and the new entry is the right-most entry in the tree (it has the - ** largest key) then use the special balance_quick() routine for - ** balancing. balance_quick() is much faster and results in a tighter - ** packing of data in the common case. + /* At this point pParent may have at most one overflow cell. And if + ** this overflow cell is present, it must be the cell with + ** index iParentIdx. This scenario comes about when this function + ** is called (indirectly) from sqlite3BtreeDelete(). */ - if( pPage->leaf && - pPage->intKey && - pPage->nOverflow==1 && - pPage->aOvfl[0].idx==pPage->nCell && - pPage->pParent->pgno!=1 && - get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno - ){ - assert( pPage->intKey ); - /* - ** TODO: Check the siblings to the left of pPage. It may be that - ** they are not full and no new page is required. - */ - return balance_quick(pPage, pParent); - } -#endif + assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); + assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx ); - if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){ - return rc; + if( !aOvflSpace ){ + return SQLITE_NOMEM; } - /* - ** Find the cell in the parent page whose left child points back - ** to pPage. The "idx" variable is the index of that cell. If pPage - ** is the rightmost child of pParent then set idx to pParent->nCell + /* Find the sibling pages to balance. Also locate the cells in pParent + ** that divide the siblings. An attempt is made to find NN siblings on + ** either side of pPage. More siblings are taken from one side, however, + ** if there are fewer than NN siblings on the other side. If pParent + ** has NB or fewer children then all children of pParent are taken. + ** + ** This loop also drops the divider cells from the parent page. This + ** way, the remainder of the function does not have to deal with any + ** overflow cells in the parent page, since if any existed they will + ** have already been removed. */ - if( pParent->idxShift ){ - Pgno pgno; - pgno = pPage->pgno; - assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); - for(idx=0; idx<pParent->nCell; idx++){ - if( get4byte(findCell(pParent, idx))==pgno ){ - break; - } + i = pParent->nOverflow + pParent->nCell; + if( i<2 ){ + nxDiv = 0; + nOld = i+1; + }else{ + nOld = 3; + if( iParentIdx==0 ){ + nxDiv = 0; + }else if( iParentIdx==i ){ + nxDiv = i-2; + }else{ + nxDiv = iParentIdx-1; } - assert( idx<pParent->nCell - || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno ); + i = 2; + } + if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ + pRight = &pParent->aData[pParent->hdrOffset+8]; }else{ - idx = pPage->idxParent; + pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); } + pgno = get4byte(pRight); + while( 1 ){ + rc = getAndInitPage(pBt, pgno, &apOld[i]); + if( rc ){ + memset(apOld, 0, (i+1)*sizeof(MemPage*)); + goto balance_cleanup; + } + nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; + if( (i--)==0 ) break; - /* - ** Initialize variables so that it will be safe to jump - ** directly to balance_cleanup at any moment. - */ - nOld = nNew = 0; - sqlite3PagerRef(pParent->pDbPage); - - /* - ** Find sibling pages to pPage and the cells in pParent that divide - ** the siblings. An attempt is made to find NN siblings on either - ** side of pPage. More siblings are taken from one side, however, if - ** pPage there are fewer than NN siblings on the other side. If pParent - ** has NB or fewer children then all children of pParent are taken. - */ - nxDiv = idx - NN; - if( nxDiv + NB > pParent->nCell ){ - nxDiv = pParent->nCell - NB + 1; - } - if( nxDiv<0 ){ - nxDiv = 0; - } - nDiv = 0; - for(i=0, k=nxDiv; i<NB; i++, k++){ - if( k<pParent->nCell ){ - apDiv[i] = findCell(pParent, k); - nDiv++; - assert( !pParent->leaf ); - pgnoOld[i] = get4byte(apDiv[i]); - }else if( k==pParent->nCell ){ - pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]); + if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){ + apDiv[i] = pParent->aOvfl[0].pCell; + pgno = get4byte(apDiv[i]); + szNew[i] = cellSizePtr(pParent, apDiv[i]); + pParent->nOverflow = 0; }else{ - break; + apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); + pgno = get4byte(apDiv[i]); + szNew[i] = cellSizePtr(pParent, apDiv[i]); + + /* Drop the cell from the parent page. apDiv[i] still points to + ** the cell within the parent, even though it has been dropped. + ** This is safe because dropping a cell only overwrites the first + ** four bytes of it, and this function does not need the first + ** four bytes of the divider cell. So the pointer is safe to use + ** later on. + ** + ** Unless SQLite is compiled in secure-delete mode. In this case, + ** the dropCell() routine will overwrite the entire cell with zeroes. + ** In this case, temporarily copy the cell into the aOvflSpace[] + ** buffer. It will be copied out again as soon as the aSpace[] buffer + ** is allocated. */ +#ifdef SQLITE_SECURE_DELETE + memcpy(&aOvflSpace[apDiv[i]-pParent->aData], apDiv[i], szNew[i]); + apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; +#endif + dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); } - rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent); - if( rc ) goto balance_cleanup; - apOld[i]->idxParent = k; - apCopy[i] = 0; - assert( i==nOld ); - nOld++; - nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; } /* Make nMaxCells a multiple of 4 in order to preserve 8-byte @@ -5095,52 +5756,25 @@ static int balance_nonroot(MemPage *pPage){ /* ** Allocate space for memory structures */ + k = pBt->pageSize + ROUND8(sizeof(MemPage)); szScratch = nMaxCells*sizeof(u8*) /* apCell */ + nMaxCells*sizeof(u16) /* szCell */ - + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */ + pBt->pageSize /* aSpace1 */ - + (ISAUTOVACUUM ? nMaxCells : 0); /* aFrom */ + + k*nOld; /* Page copies (apCopy) */ apCell = sqlite3ScratchMalloc( szScratch ); if( apCell==0 ){ rc = SQLITE_NOMEM; goto balance_cleanup; } szCell = (u16*)&apCell[nMaxCells]; - aCopy[0] = (u8*)&szCell[nMaxCells]; - assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */ - for(i=1; i<NB; i++){ - aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; - assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */ - } - aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; - assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */ - if( ISAUTOVACUUM ){ - aFrom = &aSpace1[pBt->pageSize]; - } - aSpace2 = sqlite3PageMalloc(pBt->pageSize); - if( aSpace2==0 ){ - rc = SQLITE_NOMEM; - goto balance_cleanup; - } - - /* - ** Make copies of the content of pPage and its siblings into aOld[]. - ** The rest of this function will use data from the copies rather - ** that the original pages since the original pages will be in the - ** process of being overwritten. - */ - for(i=0; i<nOld; i++){ - MemPage *p = apCopy[i] = (MemPage*)aCopy[i]; - memcpy(p, apOld[i], sizeof(MemPage)); - p->aData = (void*)&p[1]; - memcpy(p->aData, apOld[i]->aData, pBt->pageSize); - } + aSpace1 = (u8*)&szCell[nMaxCells]; + assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); /* ** Load pointers to all cells on sibling pages and the divider cells ** into the local apCell[] array. Make copies of the divider cells - ** into space obtained form aSpace1[] and remove the the divider Cells + ** into space obtained from aSpace1[] and remove the the divider Cells ** from pParent. ** ** If the siblings are on leaf pages, then the child pointers of the @@ -5153,67 +5787,54 @@ static int balance_nonroot(MemPage *pPage){ ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. ** leafData: 1 if pPage holds key+data and pParent holds only keys. */ - nCell = 0; - leafCorrection = pPage->leaf*4; - leafData = pPage->hasData; + leafCorrection = apOld[0]->leaf*4; + leafData = apOld[0]->hasData; for(i=0; i<nOld; i++){ - MemPage *pOld = apCopy[i]; - int limit = pOld->nCell+pOld->nOverflow; + int limit; + + /* Before doing anything else, take a copy of the i'th original sibling + ** The rest of this function will use data from the copies rather + ** that the original pages since the original pages will be in the + ** process of being overwritten. */ + MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; + memcpy(pOld, apOld[i], sizeof(MemPage)); + pOld->aData = (void*)&pOld[1]; + memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); + + limit = pOld->nCell+pOld->nOverflow; for(j=0; j<limit; j++){ assert( nCell<nMaxCells ); apCell[nCell] = findOverflowCell(pOld, j); szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); - if( ISAUTOVACUUM ){ - int a; - aFrom[nCell] = i; - for(a=0; a<pOld->nOverflow; a++){ - if( pOld->aOvfl[a].pCell==apCell[nCell] ){ - aFrom[nCell] = 0xFF; - break; - } - } - } nCell++; } - if( i<nOld-1 ){ - u16 sz = cellSizePtr(pParent, apDiv[i]); - if( leafData ){ - /* With the LEAFDATA flag, pParent cells hold only INTKEYs that - ** are duplicates of keys on the child pages. We need to remove - ** the divider cells from pParent, but the dividers cells are not - ** added to apCell[] because they are duplicates of child cells. - */ - dropCell(pParent, nxDiv, sz); + if( i<nOld-1 && !leafData){ + u16 sz = (u16)szNew[i]; + u8 *pTemp; + assert( nCell<nMaxCells ); + szCell[nCell] = sz; + pTemp = &aSpace1[iSpace1]; + iSpace1 += sz; + assert( sz<=pBt->pageSize/4 ); + assert( iSpace1<=pBt->pageSize ); + memcpy(pTemp, apDiv[i], sz); + apCell[nCell] = pTemp+leafCorrection; + assert( leafCorrection==0 || leafCorrection==4 ); + szCell[nCell] = szCell[nCell] - leafCorrection; + if( !pOld->leaf ){ + assert( leafCorrection==0 ); + assert( pOld->hdrOffset==0 ); + /* The right pointer of the child page pOld becomes the left + ** pointer of the divider cell */ + memcpy(apCell[nCell], &pOld->aData[8], 4); }else{ - u8 *pTemp; - assert( nCell<nMaxCells ); - szCell[nCell] = sz; - pTemp = &aSpace1[iSpace1]; - iSpace1 += sz; - assert( sz<=pBt->pageSize/4 ); - assert( iSpace1<=pBt->pageSize ); - memcpy(pTemp, apDiv[i], sz); - apCell[nCell] = pTemp+leafCorrection; - if( ISAUTOVACUUM ){ - aFrom[nCell] = 0xFF; + assert( leafCorrection==4 ); + if( szCell[nCell]<4 ){ + /* Do not allow any cells smaller than 4 bytes. */ + szCell[nCell] = 4; } - dropCell(pParent, nxDiv, sz); - szCell[nCell] -= leafCorrection; - assert( get4byte(pTemp)==pgnoOld[i] ); - if( !pOld->leaf ){ - assert( leafCorrection==0 ); - /* The right pointer of the child page pOld becomes the left - ** pointer of the divider cell */ - memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4); - }else{ - assert( leafCorrection==4 ); - if( szCell[nCell]<4 ){ - /* Do not allow any cells smaller than 4 bytes. */ - szCell[nCell] = 4; - } - } - nCell++; } + nCell++; } } @@ -5243,6 +5864,7 @@ static int balance_nonroot(MemPage *pPage){ if( leafData ){ i--; } subtotal = 0; k++; + if( k>NB+1 ){ rc = SQLITE_CORRUPT; goto balance_cleanup; } } } szNew[k] = subtotal; @@ -5280,39 +5902,55 @@ static int balance_nonroot(MemPage *pPage){ szNew[i-1] = szLeft; } - /* Either we found one or more cells (cntnew[0])>0) or we are the + /* Either we found one or more cells (cntnew[0])>0) or pPage is ** a virtual root page. A virtual root page is when the real root ** page is page 1 and we are the only child of that page. */ assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); + TRACE(("BALANCE: old: %d %d %d ", + apOld[0]->pgno, + nOld>=2 ? apOld[1]->pgno : 0, + nOld>=3 ? apOld[2]->pgno : 0 + )); + /* ** Allocate k new pages. Reuse old pages where possible. */ - assert( pPage->pgno>1 ); - pageFlags = pPage->aData[0]; + if( apOld[0]->pgno<=1 ){ + rc = SQLITE_CORRUPT; + goto balance_cleanup; + } + pageFlags = apOld[0]->aData[0]; for(i=0; i<k; i++){ MemPage *pNew; if( i<nOld ){ pNew = apNew[i] = apOld[i]; - pgnoNew[i] = pgnoOld[i]; apOld[i] = 0; rc = sqlite3PagerWrite(pNew->pDbPage); nNew++; if( rc ) goto balance_cleanup; }else{ assert( i>0 ); - rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0); + rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0); if( rc ) goto balance_cleanup; apNew[i] = pNew; nNew++; + + /* Set the pointer-map entry for the new sibling page. */ + if( ISAUTOVACUUM ){ + ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); + if( rc!=SQLITE_OK ){ + goto balance_cleanup; + } + } } } /* Free any old pages that were not reused as new pages. */ while( i<nOld ){ - rc = freePage(apOld[i]); + freePage(apOld[i], &rc); if( rc ) goto balance_cleanup; releasePage(apOld[i]); apOld[i] = 0; @@ -5334,34 +5972,32 @@ static int balance_nonroot(MemPage *pPage){ ** about 25% faster for large insertions and deletions. */ for(i=0; i<k-1; i++){ - int minV = pgnoNew[i]; + int minV = apNew[i]->pgno; int minI = i; for(j=i+1; j<k; j++){ - if( pgnoNew[j]<(unsigned)minV ){ + if( apNew[j]->pgno<(unsigned)minV ){ minI = j; - minV = pgnoNew[j]; + minV = apNew[j]->pgno; } } if( minI>i ){ int t; MemPage *pT; - t = pgnoNew[i]; + t = apNew[i]->pgno; pT = apNew[i]; - pgnoNew[i] = pgnoNew[minI]; apNew[i] = apNew[minI]; - pgnoNew[minI] = t; apNew[minI] = pT; } } - TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", - pgnoOld[0], - nOld>=2 ? pgnoOld[1] : 0, - nOld>=3 ? pgnoOld[2] : 0, - pgnoNew[0], szNew[0], - nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0, - nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0, - nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0, - nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0)); + TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", + apNew[0]->pgno, szNew[0], + nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, + nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, + nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, + nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); + + assert( sqlite3PagerIswriteable(pParent->pDbPage) ); + put4byte(pRight, apNew[nNew-1]->pgno); /* ** Evenly distribute the data in apCell[] across the new pages. @@ -5372,38 +6008,18 @@ static int balance_nonroot(MemPage *pPage){ /* Assemble the new sibling page. */ MemPage *pNew = apNew[i]; assert( j<nMaxCells ); - assert( pNew->pgno==pgnoNew[i] ); zeroPage(pNew, pageFlags); assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); assert( pNew->nOverflow==0 ); - /* If this is an auto-vacuum database, update the pointer map entries - ** that point to the siblings that were rearranged. These can be: left - ** children of cells, the right-child of the page, or overflow pages - ** pointed to by cells. - */ - if( ISAUTOVACUUM ){ - for(k=j; k<cntNew[i]; k++){ - assert( k<nMaxCells ); - if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){ - rc = ptrmapPutOvfl(pNew, k-j); - if( rc==SQLITE_OK && leafCorrection==0 ){ - rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno); - } - if( rc!=SQLITE_OK ){ - goto balance_cleanup; - } - } - } - } - j = cntNew[i]; /* If the sibling page assembled above was not the right-most sibling, ** insert a divider cell into the parent page. */ - if( i<nNew-1 && j<nCell ){ + assert( i<nNew-1 || j==nCell ); + if( j<nCell ){ u8 *pCell; u8 *pTemp; int sz; @@ -5411,17 +6027,9 @@ static int balance_nonroot(MemPage *pPage){ assert( j<nMaxCells ); pCell = apCell[j]; sz = szCell[j] + leafCorrection; - pTemp = &aSpace2[iSpace2]; + pTemp = &aOvflSpace[iOvflSpace]; if( !pNew->leaf ){ memcpy(&pNew->aData[8], pCell, 4); - if( ISAUTOVACUUM - && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno) - ){ - rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno); - if( rc!=SQLITE_OK ){ - goto balance_cleanup; - } - } }else if( leafData ){ /* If the tree is a leaf-data tree, and the siblings are leaves, ** then there is no divider cell in apCell[]. Instead, the divider @@ -5430,16 +6038,16 @@ static int balance_nonroot(MemPage *pPage){ */ CellInfo info; j--; - sqlite3BtreeParseCellPtr(pNew, apCell[j], &info); + btreeParseCellPtr(pNew, apCell[j], &info); pCell = pTemp; - fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz); + sz = 4 + putVarint(&pCell[4], info.nKey); pTemp = 0; }else{ pCell -= 4; /* Obscure case for non-leaf-data trees: If the cell at pCell was ** previously stored on a leaf node, and its reported size was 4 ** bytes, then it may actually be smaller than this - ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of + ** (see btreeParseCellPtr(), 4 bytes is the minimum size of ** any cell). But it is important to pass the correct size to ** insertCell(), so reparse the cell now. ** @@ -5452,34 +6060,16 @@ static int balance_nonroot(MemPage *pPage){ sz = cellSizePtr(pParent, pCell); } } - iSpace2 += sz; + iOvflSpace += sz; assert( sz<=pBt->pageSize/4 ); - assert( iSpace2<=pBt->pageSize ); - rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4); + assert( iOvflSpace<=pBt->pageSize ); + insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc); if( rc!=SQLITE_OK ) goto balance_cleanup; - put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno); + assert( sqlite3PagerIswriteable(pParent->pDbPage) ); - /* If this is an auto-vacuum database, and not a leaf-data tree, - ** then update the pointer map with an entry for the overflow page - ** that the cell just inserted points to (if any). - */ - if( ISAUTOVACUUM && !leafData ){ - rc = ptrmapPutOvfl(pParent, nxDiv); - if( rc!=SQLITE_OK ){ - goto balance_cleanup; - } - } j++; nxDiv++; } - - /* Set the pointer-map entry for the new sibling page. */ - if( ISAUTOVACUUM ){ - rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno); - if( rc!=SQLITE_OK ){ - goto balance_cleanup; - } - } } assert( j==nCell ); assert( nOld>0 ); @@ -5487,47 +6077,142 @@ static int balance_nonroot(MemPage *pPage){ if( (pageFlags & PTF_LEAF)==0 ){ u8 *zChild = &apCopy[nOld-1]->aData[8]; memcpy(&apNew[nNew-1]->aData[8], zChild, 4); - if( ISAUTOVACUUM ){ - rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno); - if( rc!=SQLITE_OK ){ - goto balance_cleanup; + } + + if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ + /* The root page of the b-tree now contains no cells. The only sibling + ** page is the right-child of the parent. Copy the contents of the + ** child page into the parent, decreasing the overall height of the + ** b-tree structure by one. This is described as the "balance-shallower" + ** sub-algorithm in some documentation. + ** + ** If this is an auto-vacuum database, the call to copyNodeContent() + ** sets all pointer-map entries corresponding to database image pages + ** for which the pointer is stored within the content being copied. + ** + ** The second assert below verifies that the child page is defragmented + ** (it must be, as it was just reconstructed using assemblePage()). This + ** is important if the parent page happens to be page 1 of the database + ** image. */ + assert( nNew==1 ); + assert( apNew[0]->nFree == + (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) + ); + copyNodeContent(apNew[0], pParent, &rc); + freePage(apNew[0], &rc); + }else if( ISAUTOVACUUM ){ + /* Fix the pointer-map entries for all the cells that were shifted around. + ** There are several different types of pointer-map entries that need to + ** be dealt with by this routine. Some of these have been set already, but + ** many have not. The following is a summary: + ** + ** 1) The entries associated with new sibling pages that were not + ** siblings when this function was called. These have already + ** been set. We don't need to worry about old siblings that were + ** moved to the free-list - the freePage() code has taken care + ** of those. + ** + ** 2) The pointer-map entries associated with the first overflow + ** page in any overflow chains used by new divider cells. These + ** have also already been taken care of by the insertCell() code. + ** + ** 3) If the sibling pages are not leaves, then the child pages of + ** cells stored on the sibling pages may need to be updated. + ** + ** 4) If the sibling pages are not internal intkey nodes, then any + ** overflow pages used by these cells may need to be updated + ** (internal intkey nodes never contain pointers to overflow pages). + ** + ** 5) If the sibling pages are not leaves, then the pointer-map + ** entries for the right-child pages of each sibling may need + ** to be updated. + ** + ** Cases 1 and 2 are dealt with above by other code. The next + ** block deals with cases 3 and 4 and the one after that, case 5. Since + ** setting a pointer map entry is a relatively expensive operation, this + ** code only sets pointer map entries for child or overflow pages that have + ** actually moved between pages. */ + MemPage *pNew = apNew[0]; + MemPage *pOld = apCopy[0]; + int nOverflow = pOld->nOverflow; + int iNextOld = pOld->nCell + nOverflow; + int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1); + j = 0; /* Current 'old' sibling page */ + k = 0; /* Current 'new' sibling page */ + for(i=0; i<nCell; i++){ + int isDivider = 0; + while( i==iNextOld ){ + /* Cell i is the cell immediately following the last cell on old + ** sibling page j. If the siblings are not leaf pages of an + ** intkey b-tree, then cell i was a divider cell. */ + pOld = apCopy[++j]; + iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; + if( pOld->nOverflow ){ + nOverflow = pOld->nOverflow; + iOverflow = i + !leafData + pOld->aOvfl[0].idx; + } + isDivider = !leafData; + } + + assert(nOverflow>0 || iOverflow<i ); + assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1); + assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1); + if( i==iOverflow ){ + isDivider = 1; + if( (--nOverflow)>0 ){ + iOverflow++; + } + } + + if( i==cntNew[k] ){ + /* Cell i is the cell immediately following the last cell on new + ** sibling page k. If the siblings are not leaf pages of an + ** intkey b-tree, then cell i is a divider cell. */ + pNew = apNew[++k]; + if( !leafData ) continue; + } + assert( j<nOld ); + assert( k<nNew ); + + /* If the cell was originally divider cell (and is not now) or + ** an overflow cell, or if the cell was located on a different sibling + ** page before the balancing, then the pointer map entries associated + ** with any child or overflow pages need to be updated. */ + if( isDivider || pOld->pgno!=pNew->pgno ){ + if( !leafCorrection ){ + ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc); + } + if( szCell[i]>pNew->minLocal ){ + ptrmapPutOvflPtr(pNew, apCell[i], &rc); + } } } - } - if( nxDiv==pParent->nCell+pParent->nOverflow ){ - /* Right-most sibling is the right-most child of pParent */ - put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]); - }else{ - /* Right-most sibling is the left child of the first entry in pParent - ** past the right-most divider entry */ - put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]); - } - /* - ** Reparent children of all cells. - */ - for(i=0; i<nNew; i++){ - rc = reparentChildPages(apNew[i], 0); - if( rc!=SQLITE_OK ) goto balance_cleanup; + if( !leafCorrection ){ + for(i=0; i<nNew; i++){ + u32 key = get4byte(&apNew[i]->aData[8]); + ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); + } + } + +#if 0 + /* The ptrmapCheckPages() contains assert() statements that verify that + ** all pointer map pages are set correctly. This is helpful while + ** debugging. This is usually disabled because a corrupt database may + ** cause an assert() statement to fail. */ + ptrmapCheckPages(apNew, nNew); + ptrmapCheckPages(&pParent, 1); +#endif } - rc = reparentChildPages(pParent, 0); - if( rc!=SQLITE_OK ) goto balance_cleanup; - /* - ** Balance the parent page. Note that the current page (pPage) might - ** have been added to the freelist so it might no longer be initialized. - ** But the parent page will always be initialized. - */ assert( pParent->isInit ); - sqlite3ScratchFree(apCell); - apCell = 0; - rc = balance(pParent, 0); - + TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", + nOld, nNew, nCell)); + /* ** Cleanup before returning. */ balance_cleanup: - sqlite3PageFree(aSpace2); sqlite3ScratchFree(apCell); for(i=0; i<nOld; i++){ releasePage(apOld[i]); @@ -5535,268 +6220,197 @@ balance_cleanup: for(i=0; i<nNew; i++){ releasePage(apNew[i]); } - releasePage(pParent); - TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n", - pPage->pgno, nOld, nNew, nCell)); - return rc; -} - -/* -** This routine is called for the root page of a btree when the root -** page contains no cells. This is an opportunity to make the tree -** shallower by one level. -*/ -static int balance_shallower(MemPage *pPage){ - MemPage *pChild; /* The only child page of pPage */ - Pgno pgnoChild; /* Page number for pChild */ - int rc = SQLITE_OK; /* Return code from subprocedures */ - BtShared *pBt; /* The main BTree structure */ - int mxCellPerPage; /* Maximum number of cells per page */ - u8 **apCell; /* All cells from pages being balanced */ - u16 *szCell; /* Local size of all cells */ - assert( pPage->pParent==0 ); - assert( pPage->nCell==0 ); - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - pBt = pPage->pBt; - mxCellPerPage = MX_CELL(pBt); - apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) ); - if( apCell==0 ) return SQLITE_NOMEM; - szCell = (u16*)&apCell[mxCellPerPage]; - if( pPage->leaf ){ - /* The table is completely empty */ - TRACE(("BALANCE: empty table %d\n", pPage->pgno)); - }else{ - /* The root page is empty but has one child. Transfer the - ** information from that one child into the root page if it - ** will fit. This reduces the depth of the tree by one. - ** - ** If the root page is page 1, it has less space available than - ** its child (due to the 100 byte header that occurs at the beginning - ** of the database fle), so it might not be able to hold all of the - ** information currently contained in the child. If this is the - ** case, then do not do the transfer. Leave page 1 empty except - ** for the right-pointer to the child page. The child page becomes - ** the virtual root of the tree. - */ - pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]); - assert( pgnoChild>0 ); - assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) ); - rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0); - if( rc ) goto end_shallow_balance; - if( pPage->pgno==1 ){ - rc = sqlite3BtreeInitPage(pChild, pPage); - if( rc ) goto end_shallow_balance; - assert( pChild->nOverflow==0 ); - if( pChild->nFree>=100 ){ - /* The child information will fit on the root page, so do the - ** copy */ - int i; - zeroPage(pPage, pChild->aData[0]); - for(i=0; i<pChild->nCell; i++){ - apCell[i] = findCell(pChild,i); - szCell[i] = cellSizePtr(pChild, apCell[i]); - } - assemblePage(pPage, pChild->nCell, apCell, szCell); - /* Copy the right-pointer of the child to the parent. */ - put4byte(&pPage->aData[pPage->hdrOffset+8], - get4byte(&pChild->aData[pChild->hdrOffset+8])); - freePage(pChild); - TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno)); - }else{ - /* The child has more information that will fit on the root. - ** The tree is already balanced. Do nothing. */ - TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno)); - } - }else{ - memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize); - pPage->isInit = 0; - pPage->pParent = 0; - rc = sqlite3BtreeInitPage(pPage, 0); - assert( rc==SQLITE_OK ); - freePage(pChild); - TRACE(("BALANCE: transfer child %d into root %d\n", - pChild->pgno, pPage->pgno)); - } - rc = reparentChildPages(pPage, 1); - assert( pPage->nOverflow==0 ); - if( ISAUTOVACUUM ){ - int i; - for(i=0; i<pPage->nCell; i++){ - rc = ptrmapPutOvfl(pPage, i); - if( rc!=SQLITE_OK ){ - goto end_shallow_balance; - } - } - } - releasePage(pChild); - } -end_shallow_balance: - sqlite3_free(apCell); return rc; } /* -** The root page is overfull +** This function is called when the root page of a b-tree structure is +** overfull (has one or more overflow pages). +** +** A new child page is allocated and the contents of the current root +** page, including overflow cells, are copied into the child. The root +** page is then overwritten to make it an empty page with the right-child +** pointer pointing to the new page. ** -** When this happens, Create a new child page and copy the -** contents of the root into the child. Then make the root -** page an empty page with rightChild pointing to the new -** child. Finally, call balance_internal() on the new child -** to cause it to split. +** Before returning, all pointer-map entries corresponding to pages +** that the new child-page now contains pointers to are updated. The +** entry corresponding to the new right-child pointer of the root +** page is also updated. +** +** If successful, *ppChild is set to contain a reference to the child +** page and SQLITE_OK is returned. In this case the caller is required +** to call releasePage() on *ppChild exactly once. If an error occurs, +** an error code is returned and *ppChild is set to 0. */ -static int balance_deeper(MemPage *pPage){ - int rc; /* Return value from subprocedures */ - MemPage *pChild; /* Pointer to a new child page */ - Pgno pgnoChild; /* Page number of the new child page */ - BtShared *pBt; /* The BTree */ - int usableSize; /* Total usable size of a page */ - u8 *data; /* Content of the parent page */ - u8 *cdata; /* Content of the child page */ - int hdr; /* Offset to page header in parent */ - int brk; /* Offset to content of first cell in parent */ +static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ + int rc; /* Return value from subprocedures */ + MemPage *pChild = 0; /* Pointer to a new child page */ + Pgno pgnoChild = 0; /* Page number of the new child page */ + BtShared *pBt = pRoot->pBt; /* The BTree */ - assert( pPage->pParent==0 ); - assert( pPage->nOverflow>0 ); - pBt = pPage->pBt; + assert( pRoot->nOverflow>0 ); assert( sqlite3_mutex_held(pBt->mutex) ); - rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0); - if( rc ) return rc; - assert( sqlite3PagerIswriteable(pChild->pDbPage) ); - usableSize = pBt->usableSize; - data = pPage->aData; - hdr = pPage->hdrOffset; - brk = get2byte(&data[hdr+5]); - cdata = pChild->aData; - memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr); - memcpy(&cdata[brk], &data[brk], usableSize-brk); - if( pChild->isInit ) return SQLITE_CORRUPT; - rc = sqlite3BtreeInitPage(pChild, pPage); - if( rc ) goto balancedeeper_out; - memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0])); - pChild->nOverflow = pPage->nOverflow; - if( pChild->nOverflow ){ - pChild->nFree = 0; - } - assert( pChild->nCell==pPage->nCell ); - zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF); - put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild); - TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno)); - if( ISAUTOVACUUM ){ - int i; - rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno); - if( rc ) goto balancedeeper_out; - for(i=0; i<pChild->nCell; i++){ - rc = ptrmapPutOvfl(pChild, i); - if( rc!=SQLITE_OK ){ - goto balancedeeper_out; - } + + /* Make pRoot, the root page of the b-tree, writable. Allocate a new + ** page that will become the new right-child of pPage. Copy the contents + ** of the node stored on pRoot into the new child page. + */ + rc = sqlite3PagerWrite(pRoot->pDbPage); + if( rc==SQLITE_OK ){ + rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); + copyNodeContent(pRoot, pChild, &rc); + if( ISAUTOVACUUM ){ + ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); } - rc = reparentChildPages(pChild, 1); } - if( rc==SQLITE_OK ){ - rc = balance_nonroot(pChild); + if( rc ){ + *ppChild = 0; + releasePage(pChild); + return rc; } + assert( sqlite3PagerIswriteable(pChild->pDbPage) ); + assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); + assert( pChild->nCell==pRoot->nCell ); -balancedeeper_out: - releasePage(pChild); - return rc; -} + TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno)); -/* -** Decide if the page pPage needs to be balanced. If balancing is -** required, call the appropriate balancing routine. -*/ -static int balance(MemPage *pPage, int insert){ - int rc = SQLITE_OK; - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - if( pPage->pParent==0 ){ - rc = sqlite3PagerWrite(pPage->pDbPage); - if( rc==SQLITE_OK && pPage->nOverflow>0 ){ - rc = balance_deeper(pPage); - } - if( rc==SQLITE_OK && pPage->nCell==0 ){ - rc = balance_shallower(pPage); - } - }else{ - if( pPage->nOverflow>0 || - (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){ - rc = balance_nonroot(pPage); - } - } - return rc; + /* Copy the overflow cells from pRoot to pChild */ + memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0])); + pChild->nOverflow = pRoot->nOverflow; + + /* Zero the contents of pRoot. Then install pChild as the right-child. */ + zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); + put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); + + *ppChild = pChild; + return SQLITE_OK; } /* -** This routine checks all cursors that point to table pgnoRoot. -** If any of those cursors were opened with wrFlag==0 in a different -** database connection (a database connection that shares the pager -** cache with the current connection) and that other connection -** is not in the ReadUncommmitted state, then this routine returns -** SQLITE_LOCKED. -** -** As well as cursors with wrFlag==0, cursors with wrFlag==1 and -** isIncrblobHandle==1 are also considered 'read' cursors. Incremental -** blob cursors are used for both reading and writing. -** -** When pgnoRoot is the root page of an intkey table, this function is also -** responsible for invalidating incremental blob cursors when the table row -** on which they are opened is deleted or modified. Cursors are invalidated -** according to the following rules: +** The page that pCur currently points to has just been modified in +** some way. This function figures out if this modification means the +** tree needs to be balanced, and if so calls the appropriate balancing +** routine. Balancing routines are: ** -** 1) When BtreeClearTable() is called to completely delete the contents -** of a B-Tree table, pExclude is set to zero and parameter iRow is -** set to non-zero. In this case all incremental blob cursors open -** on the table rooted at pgnoRoot are invalidated. -** -** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to -** modify a table row via an SQL statement, pExclude is set to the -** write cursor used to do the modification and parameter iRow is set -** to the integer row id of the B-Tree entry being modified. Unless -** pExclude is itself an incremental blob cursor, then all incremental -** blob cursors open on row iRow of the B-Tree are invalidated. -** -** 3) If both pExclude and iRow are set to zero, no incremental blob -** cursors are invalidated. +** balance_quick() +** balance_deeper() +** balance_nonroot() */ -static int checkReadLocks( - Btree *pBtree, - Pgno pgnoRoot, - BtCursor *pExclude, - i64 iRow -){ - BtCursor *p; - BtShared *pBt = pBtree->pBt; - sqlite3 *db = pBtree->db; - assert( sqlite3BtreeHoldsMutex(pBtree) ); - for(p=pBt->pCursor; p; p=p->pNext){ - if( p==pExclude ) continue; - if( p->pgnoRoot!=pgnoRoot ) continue; -#ifndef SQLITE_OMIT_INCRBLOB - if( p->isIncrblobHandle && ( - (!pExclude && iRow) - || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow) - )){ - p->eState = CURSOR_INVALID; - } -#endif - if( p->eState!=CURSOR_VALID ) continue; - if( p->wrFlag==0 -#ifndef SQLITE_OMIT_INCRBLOB - || p->isIncrblobHandle +static int balance(BtCursor *pCur){ + int rc = SQLITE_OK; + const int nMin = pCur->pBt->usableSize * 2 / 3; + u8 aBalanceQuickSpace[13]; + u8 *pFree = 0; + + TESTONLY( int balance_quick_called = 0 ); + TESTONLY( int balance_deeper_called = 0 ); + + do { + int iPage = pCur->iPage; + MemPage *pPage = pCur->apPage[iPage]; + + if( iPage==0 ){ + if( pPage->nOverflow ){ + /* The root page of the b-tree is overfull. In this case call the + ** balance_deeper() function to create a new child for the root-page + ** and copy the current contents of the root-page to it. The + ** next iteration of the do-loop will balance the child page. + */ + assert( (balance_deeper_called++)==0 ); + rc = balance_deeper(pPage, &pCur->apPage[1]); + if( rc==SQLITE_OK ){ + pCur->iPage = 1; + pCur->aiIdx[0] = 0; + pCur->aiIdx[1] = 0; + assert( pCur->apPage[1]->nOverflow ); + } + }else{ + break; + } + }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ + break; + }else{ + MemPage * const pParent = pCur->apPage[iPage-1]; + int const iIdx = pCur->aiIdx[iPage-1]; + + rc = sqlite3PagerWrite(pParent->pDbPage); + if( rc==SQLITE_OK ){ +#ifndef SQLITE_OMIT_QUICKBALANCE + if( pPage->hasData + && pPage->nOverflow==1 + && pPage->aOvfl[0].idx==pPage->nCell + && pParent->pgno!=1 + && pParent->nCell==iIdx + ){ + /* Call balance_quick() to create a new sibling of pPage on which + ** to store the overflow cell. balance_quick() inserts a new cell + ** into pParent, which may cause pParent overflow. If this + ** happens, the next interation of the do-loop will balance pParent + ** use either balance_nonroot() or balance_deeper(). Until this + ** happens, the overflow cell is stored in the aBalanceQuickSpace[] + ** buffer. + ** + ** The purpose of the following assert() is to check that only a + ** single call to balance_quick() is made for each call to this + ** function. If this were not verified, a subtle bug involving reuse + ** of the aBalanceQuickSpace[] might sneak in. + */ + assert( (balance_quick_called++)==0 ); + rc = balance_quick(pParent, pPage, aBalanceQuickSpace); + }else #endif - ){ - sqlite3 *dbOther = p->pBtree->db; - if( dbOther==0 || - (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){ - return SQLITE_LOCKED; + { + /* In this case, call balance_nonroot() to redistribute cells + ** between pPage and up to 2 of its sibling pages. This involves + ** modifying the contents of pParent, which may cause pParent to + ** become overfull or underfull. The next iteration of the do-loop + ** will balance the parent page to correct this. + ** + ** If the parent page becomes overfull, the overflow cell or cells + ** are stored in the pSpace buffer allocated immediately below. + ** A subsequent iteration of the do-loop will deal with this by + ** calling balance_nonroot() (balance_deeper() may be called first, + ** but it doesn't deal with overflow cells - just moves them to a + ** different page). Once this subsequent call to balance_nonroot() + ** has completed, it is safe to release the pSpace buffer used by + ** the previous call, as the overflow cell data will have been + ** copied either into the body of a database page or into the new + ** pSpace buffer passed to the latter call to balance_nonroot(). + */ + u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); + rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1); + if( pFree ){ + /* If pFree is not NULL, it points to the pSpace buffer used + ** by a previous call to balance_nonroot(). Its contents are + ** now stored either on real database pages or within the + ** new pSpace buffer, so it may be safely freed here. */ + sqlite3PageFree(pFree); + } + + /* The pSpace buffer will be freed after the next call to + ** balance_nonroot(), or just before this function returns, whichever + ** comes first. */ + pFree = pSpace; + } } + + pPage->nOverflow = 0; + + /* The next iteration of the do-loop balances the parent page. */ + releasePage(pPage); + pCur->iPage--; } + }while( rc==SQLITE_OK ); + + if( pFree ){ + sqlite3PageFree(pFree); } - return SQLITE_OK; + return rc; } + /* ** Insert a new record into the BTree. The key is given by (pKey,nKey) ** and the data is given by (pData,nData). The cursor is used only to @@ -5805,52 +6419,84 @@ static int checkReadLocks( ** ** For an INTKEY table, only the nKey value of the key is used. pKey is ** ignored. For a ZERODATA table, the pData and nData are both ignored. +** +** If the seekResult parameter is non-zero, then a successful call to +** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already +** been performed. seekResult is the search result returned (a negative +** number if pCur points at an entry that is smaller than (pKey, nKey), or +** a positive value if pCur points at an etry that is larger than +** (pKey, nKey)). +** +** If the seekResult parameter is non-zero, then the caller guarantees that +** cursor pCur is pointing at the existing copy of a row that is to be +** overwritten. If the seekResult parameter is 0, then cursor pCur may +** point to any entry or to no entry at all and so this function has to seek +** the cursor before the new key can be inserted. */ int sqlite3BtreeInsert( BtCursor *pCur, /* Insert data into the table of this cursor */ const void *pKey, i64 nKey, /* The key of the new record */ const void *pData, int nData, /* The data of the new record */ int nZero, /* Number of extra 0 bytes to append to data */ - int appendBias /* True if this is likely an append */ + int appendBias, /* True if this is likely an append */ + int seekResult /* Result of prior MovetoUnpacked() call */ ){ int rc; - int loc; + int loc = seekResult; /* -1: before desired location +1: after */ int szNew; + int idx; MemPage *pPage; Btree *p = pCur->pBtree; BtShared *pBt = p->pBt; unsigned char *oldCell; unsigned char *newCell = 0; - assert( cursorHoldsMutex(pCur) ); - if( pBt->inTransaction!=TRANS_WRITE ){ - /* Must start a transaction before doing an insert */ - rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; - return rc; - } - assert( !pBt->readOnly ); - if( !pCur->wrFlag ){ - return SQLITE_PERM; /* Cursor not open for writing */ - } - if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){ - return SQLITE_LOCKED; /* The table pCur points to has a read lock */ - } if( pCur->eState==CURSOR_FAULT ){ - return pCur->skip; + assert( pCur->skipNext!=SQLITE_OK ); + return pCur->skipNext; } - /* Save the positions of any other cursors open on this table */ - clearCursorPosition(pCur); - if( - SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || - SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, 0, nKey, appendBias, &loc)) - ){ - return rc; + assert( cursorHoldsMutex(pCur) ); + assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly ); + assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); + + /* Assert that the caller has been consistent. If this cursor was opened + ** expecting an index b-tree, then the caller should be inserting blob + ** keys with no associated data. If the cursor was opened expecting an + ** intkey table, the caller should be inserting integer keys with a + ** blob of associated data. */ + assert( (pKey==0)==(pCur->pKeyInfo==0) ); + + /* If this is an insert into a table b-tree, invalidate any incrblob + ** cursors open on the row being replaced (assuming this is a replace + ** operation - if it is not, the following is a no-op). */ + if( pCur->pKeyInfo==0 ){ + invalidateIncrblobCursors(p, nKey, 0); } - pPage = pCur->pPage; + /* Save the positions of any other cursors open on this table. + ** + ** In some cases, the call to btreeMoveto() below is a no-op. For + ** example, when inserting data into a table with auto-generated integer + ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the + ** integer key to use. It then calls this function to actually insert the + ** data into the intkey B-Tree. In this case btreeMoveto() recognizes + ** that the cursor is already where it needs to be and returns without + ** doing any work. To avoid thwarting these optimizations, it is important + ** not to clear the cursor here. + */ + rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); + if( rc ) return rc; + if( !loc ){ + rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc); + if( rc ) return rc; + } + assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) ); + + pPage = pCur->apPage[pCur->iPage]; assert( pPage->intKey || nKey>=0 ); assert( pPage->leaf || !pPage->intKey ); + TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", pCur->pgnoRoot, nKey, nData, pPage->pgno, loc==0 ? "overwrite" : "new entry")); @@ -5862,148 +6508,179 @@ int sqlite3BtreeInsert( if( rc ) goto end_insert; assert( szNew==cellSizePtr(pPage, newCell) ); assert( szNew<=MX_CELL_SIZE(pBt) ); - if( loc==0 && CURSOR_VALID==pCur->eState ){ + idx = pCur->aiIdx[pCur->iPage]; + if( loc==0 ){ u16 szOld; - assert( pCur->idx>=0 && pCur->idx<pPage->nCell ); + assert( idx<pPage->nCell ); rc = sqlite3PagerWrite(pPage->pDbPage); if( rc ){ goto end_insert; } - oldCell = findCell(pPage, pCur->idx); + oldCell = findCell(pPage, idx); if( !pPage->leaf ){ memcpy(newCell, oldCell, 4); } szOld = cellSizePtr(pPage, oldCell); rc = clearCell(pPage, oldCell); + dropCell(pPage, idx, szOld, &rc); if( rc ) goto end_insert; - dropCell(pPage, pCur->idx, szOld); }else if( loc<0 && pPage->nCell>0 ){ assert( pPage->leaf ); - pCur->idx++; - pCur->info.nSize = 0; - pCur->validNKey = 0; + idx = ++pCur->aiIdx[pCur->iPage]; }else{ assert( pPage->leaf ); } - rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0); - if( rc!=SQLITE_OK ) goto end_insert; - rc = balance(pPage, 1); - if( rc==SQLITE_OK ){ - moveToRoot(pCur); + insertCell(pPage, idx, newCell, szNew, 0, 0, &rc); + assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); + + /* If no error has occured and pPage has an overflow cell, call balance() + ** to redistribute the cells within the tree. Since balance() may move + ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey + ** variables. + ** + ** Previous versions of SQLite called moveToRoot() to move the cursor + ** back to the root page as balance() used to invalidate the contents + ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, + ** set the cursor state to "invalid". This makes common insert operations + ** slightly faster. + ** + ** There is a subtle but important optimization here too. When inserting + ** multiple records into an intkey b-tree using a single cursor (as can + ** happen while processing an "INSERT INTO ... SELECT" statement), it + ** is advantageous to leave the cursor pointing to the last entry in + ** the b-tree if possible. If the cursor is left pointing to the last + ** entry in the table, and the next row inserted has an integer key + ** larger than the largest existing key, it is possible to insert the + ** row without seeking the cursor. This can be a big performance boost. + */ + pCur->info.nSize = 0; + pCur->validNKey = 0; + if( rc==SQLITE_OK && pPage->nOverflow ){ + rc = balance(pCur); + + /* Must make sure nOverflow is reset to zero even if the balance() + ** fails. Internal data structure corruption will result otherwise. + ** Also, set the cursor state to invalid. This stops saveCursorPosition() + ** from trying to save the current position of the cursor. */ + pCur->apPage[pCur->iPage]->nOverflow = 0; + pCur->eState = CURSOR_INVALID; } + assert( pCur->apPage[pCur->iPage]->nOverflow==0 ); + end_insert: return rc; } /* ** Delete the entry that the cursor is pointing to. The cursor -** is left pointing at a random location. +** is left pointing at a arbitrary location. */ int sqlite3BtreeDelete(BtCursor *pCur){ - MemPage *pPage = pCur->pPage; - unsigned char *pCell; - int rc; - Pgno pgnoChild = 0; Btree *p = pCur->pBtree; - BtShared *pBt = p->pBt; + BtShared *pBt = p->pBt; + int rc; /* Return code */ + MemPage *pPage; /* Page to delete cell from */ + unsigned char *pCell; /* Pointer to cell to delete */ + int iCellIdx; /* Index of cell to delete */ + int iCellDepth; /* Depth of node containing pCell */ assert( cursorHoldsMutex(pCur) ); - assert( pPage->isInit ); - if( pBt->inTransaction!=TRANS_WRITE ){ - /* Must start a transaction before doing a delete */ - rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; - return rc; - } + assert( pBt->inTransaction==TRANS_WRITE ); assert( !pBt->readOnly ); - if( pCur->eState==CURSOR_FAULT ){ - return pCur->skip; - } - if( pCur->idx >= pPage->nCell ){ - return SQLITE_ERROR; /* The cursor is not pointing to anything */ - } - if( !pCur->wrFlag ){ - return SQLITE_PERM; /* Did not open this cursor for writing */ - } - if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){ - return SQLITE_LOCKED; /* The table pCur points to has a read lock */ - } + assert( pCur->wrFlag ); + assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); + assert( !hasReadConflicts(p, pCur->pgnoRoot) ); - /* Restore the current cursor position (a no-op if the cursor is not in - ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors - ** open on the same table. Then call sqlite3PagerWrite() on the page - ** that the entry will be deleted from. - */ - if( - (rc = restoreCursorPosition(pCur))!=0 || - (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 || - (rc = sqlite3PagerWrite(pPage->pDbPage))!=0 + if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell) + || NEVER(pCur->eState!=CURSOR_VALID) ){ - return rc; + return SQLITE_ERROR; /* Something has gone awry. */ } - /* Locate the cell within its page and leave pCell pointing to the - ** data. The clearCell() call frees any overflow pages associated with the - ** cell. The cell itself is still intact. - */ - pCell = findCell(pPage, pCur->idx); + /* If this is a delete operation to remove a row from a table b-tree, + ** invalidate any incrblob cursors open on the row being deleted. */ + if( pCur->pKeyInfo==0 ){ + invalidateIncrblobCursors(p, pCur->info.nKey, 0); + } + + iCellDepth = pCur->iPage; + iCellIdx = pCur->aiIdx[iCellDepth]; + pPage = pCur->apPage[iCellDepth]; + pCell = findCell(pPage, iCellIdx); + + /* If the page containing the entry to delete is not a leaf page, move + ** the cursor to the largest entry in the tree that is smaller than + ** the entry being deleted. This cell will replace the cell being deleted + ** from the internal node. The 'previous' entry is used for this instead + ** of the 'next' entry, as the previous entry is always a part of the + ** sub-tree headed by the child page of the cell being deleted. This makes + ** balancing the tree following the delete operation easier. */ if( !pPage->leaf ){ - pgnoChild = get4byte(pCell); + int notUsed; + rc = sqlite3BtreePrevious(pCur, ¬Used); + if( rc ) return rc; } + + /* Save the positions of any other cursors open on this table before + ** making any modifications. Make the page containing the entry to be + ** deleted writable. Then free any overflow pages associated with the + ** entry and finally remove the cell itself from within the page. + */ + rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); + if( rc ) return rc; + rc = sqlite3PagerWrite(pPage->pDbPage); + if( rc ) return rc; rc = clearCell(pPage, pCell); - if( rc ){ - return rc; - } + dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc); + if( rc ) return rc; + /* If the cell deleted was not located on a leaf page, then the cursor + ** is currently pointing to the largest entry in the sub-tree headed + ** by the child-page of the cell that was just deleted from an internal + ** node. The cell from the leaf node needs to be moved to the internal + ** node to replace the deleted cell. */ if( !pPage->leaf ){ - /* - ** The entry we are about to delete is not a leaf so if we do not - ** do something we will leave a hole on an internal page. - ** We have to fill the hole by moving in a cell from a leaf. The - ** next Cell after the one to be deleted is guaranteed to exist and - ** to be a leaf so we can use it. - */ - BtCursor leafCur; - unsigned char *pNext; - int notUsed; - unsigned char *tempCell = 0; - assert( !pPage->intKey ); - sqlite3BtreeGetTempCursor(pCur, &leafCur); - rc = sqlite3BtreeNext(&leafCur, ¬Used); - if( rc==SQLITE_OK ){ - rc = sqlite3PagerWrite(leafCur.pPage->pDbPage); - } - if( rc==SQLITE_OK ){ - u16 szNext; - TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n", - pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno)); - dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell)); - pNext = findCell(leafCur.pPage, leafCur.idx); - szNext = cellSizePtr(leafCur.pPage, pNext); - assert( MX_CELL_SIZE(pBt)>=szNext+4 ); - allocateTempSpace(pBt); - tempCell = pBt->pTmpSpace; - if( tempCell==0 ){ - rc = SQLITE_NOMEM; - } - if( rc==SQLITE_OK ){ - rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0); - } - if( rc==SQLITE_OK ){ - put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild); - rc = balance(pPage, 0); - } - if( rc==SQLITE_OK ){ - dropCell(leafCur.pPage, leafCur.idx, szNext); - rc = balance(leafCur.pPage, 0); - } - } - sqlite3BtreeReleaseTempCursor(&leafCur); - }else{ - TRACE(("DELETE: table=%d delete from leaf %d\n", - pCur->pgnoRoot, pPage->pgno)); - dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell)); - rc = balance(pPage, 0); + MemPage *pLeaf = pCur->apPage[pCur->iPage]; + int nCell; + Pgno n = pCur->apPage[iCellDepth+1]->pgno; + unsigned char *pTmp; + + pCell = findCell(pLeaf, pLeaf->nCell-1); + nCell = cellSizePtr(pLeaf, pCell); + assert( MX_CELL_SIZE(pBt)>=nCell ); + + allocateTempSpace(pBt); + pTmp = pBt->pTmpSpace; + + rc = sqlite3PagerWrite(pLeaf->pDbPage); + insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); + dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); + if( rc ) return rc; } + + /* Balance the tree. If the entry deleted was located on a leaf page, + ** then the cursor still points to that page. In this case the first + ** call to balance() repairs the tree, and the if(...) condition is + ** never true. + ** + ** Otherwise, if the entry deleted was on an internal node page, then + ** pCur is pointing to the leaf page from which a cell was removed to + ** replace the cell deleted from the internal node. This is slightly + ** tricky as the leaf node may be underfull, and the internal node may + ** be either under or overfull. In this case run the balancing algorithm + ** on the leaf node first. If the balance proceeds far enough up the + ** tree that we can be sure that any problem in the internal node has + ** been corrected, so be it. Otherwise, after balancing the leaf node, + ** walk the cursor up the tree to the internal node and balance it as + ** well. */ + rc = balance(pCur); + if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ + while( pCur->iPage>iCellDepth ){ + releasePage(pCur->apPage[pCur->iPage--]); + } + rc = balance(pCur); + } + if( rc==SQLITE_OK ){ moveToRoot(pCur); } @@ -6028,11 +6705,7 @@ static int btreeCreateTable(Btree *p, int *piTable, int flags){ int rc; assert( sqlite3BtreeHoldsMutex(p) ); - if( pBt->inTransaction!=TRANS_WRITE ){ - /* Must start a transaction first */ - rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; - return rc; - } + assert( pBt->inTransaction==TRANS_WRITE ); assert( !pBt->readOnly ); #ifdef SQLITE_OMIT_AUTOVACUUM @@ -6056,10 +6729,7 @@ static int btreeCreateTable(Btree *p, int *piTable, int flags){ ** root page of the new table should go. meta[3] is the largest root-page ** created so far, so the new root-page is (meta[3]+1). */ - rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot); - if( rc!=SQLITE_OK ){ - return rc; - } + sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); pgnoRoot++; /* The new root-page may not be allocated on a pointer-map page, or the @@ -6087,28 +6757,26 @@ static int btreeCreateTable(Btree *p, int *piTable, int flags){ ** by extending the file), the current page at position pgnoMove ** is already journaled. */ - u8 eType; - Pgno iPtrPage; + u8 eType = 0; + Pgno iPtrPage = 0; releasePage(pPageMove); /* Move the page currently at pgnoRoot to pgnoMove. */ - rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); + rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); if( rc!=SQLITE_OK ){ return rc; } rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); - if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ - releasePage(pRoot); - return rc; + if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ + rc = SQLITE_CORRUPT_BKPT; } - assert( eType!=PTRMAP_ROOTPAGE ); - assert( eType!=PTRMAP_FREEPAGE ); - rc = sqlite3PagerWrite(pRoot->pDbPage); if( rc!=SQLITE_OK ){ releasePage(pRoot); return rc; } + assert( eType!=PTRMAP_ROOTPAGE ); + assert( eType!=PTRMAP_FREEPAGE ); rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); releasePage(pRoot); @@ -6116,7 +6784,7 @@ static int btreeCreateTable(Btree *p, int *piTable, int flags){ if( rc!=SQLITE_OK ){ return rc; } - rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0); + rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); if( rc!=SQLITE_OK ){ return rc; } @@ -6130,7 +6798,7 @@ static int btreeCreateTable(Btree *p, int *piTable, int flags){ } /* Update the pointer-map and meta-data with the new root-page number. */ - rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0); + ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); if( rc ){ releasePage(pRoot); return rc; @@ -6155,7 +6823,6 @@ static int btreeCreateTable(Btree *p, int *piTable, int flags){ int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ int rc; sqlite3BtreeEnter(p); - p->pBt->db = p->db; rc = btreeCreateTable(p, piTable, flags); sqlite3BtreeLeave(p); return rc; @@ -6168,36 +6835,39 @@ int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ static int clearDatabasePage( BtShared *pBt, /* The BTree that contains the table */ Pgno pgno, /* Page number to clear */ - MemPage *pParent, /* Parent page. NULL for the root */ - int freePageFlag /* Deallocate page if true */ + int freePageFlag, /* Deallocate page if true */ + int *pnChange ){ - MemPage *pPage = 0; + MemPage *pPage; int rc; unsigned char *pCell; int i; assert( sqlite3_mutex_held(pBt->mutex) ); - if( pgno>pagerPagecount(pBt->pPager) ){ + if( pgno>pagerPagecount(pBt) ){ return SQLITE_CORRUPT_BKPT; } - rc = getAndInitPage(pBt, pgno, &pPage, pParent); - if( rc ) goto cleardatabasepage_out; + rc = getAndInitPage(pBt, pgno, &pPage); + if( rc ) return rc; for(i=0; i<pPage->nCell; i++){ pCell = findCell(pPage, i); if( !pPage->leaf ){ - rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1); + rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); if( rc ) goto cleardatabasepage_out; } rc = clearCell(pPage, pCell); if( rc ) goto cleardatabasepage_out; } if( !pPage->leaf ){ - rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1); + rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange); if( rc ) goto cleardatabasepage_out; + }else if( pnChange ){ + assert( pPage->intKey ); + *pnChange += pPage->nCell; } if( freePageFlag ){ - rc = freePage(pPage); + freePage(pPage, &rc); }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ zeroPage(pPage, pPage->aData[0] | PTF_LEAF); } @@ -6215,20 +6885,25 @@ cleardatabasepage_out: ** This routine will fail with SQLITE_LOCKED if there are any open ** read cursors on the table. Open write cursors are moved to the ** root of the table. +** +** If pnChange is not NULL, then table iTable must be an intkey table. The +** integer value pointed to by pnChange is incremented by the number of +** entries in the table. */ -int sqlite3BtreeClearTable(Btree *p, int iTable){ +int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ int rc; BtShared *pBt = p->pBt; sqlite3BtreeEnter(p); - pBt->db = p->db; - if( p->inTrans!=TRANS_WRITE ){ - rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; - }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){ - /* nothing to do */ - }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){ - /* nothing to do */ - }else{ - rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0); + assert( p->inTrans==TRANS_WRITE ); + + /* Invalidate all incrblob cursors open on table iTable (assuming iTable + ** is the root of a table b-tree - if it is not, the following call is + ** a no-op). */ + invalidateIncrblobCursors(p, 0, 1); + + rc = saveAllCursors(pBt, (Pgno)iTable, 0); + if( SQLITE_OK==rc ){ + rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); } sqlite3BtreeLeave(p); return rc; @@ -6254,29 +6929,30 @@ int sqlite3BtreeClearTable(Btree *p, int iTable){ ** The last root page is recorded in meta[3] and the value of ** meta[3] is updated by this procedure. */ -static int btreeDropTable(Btree *p, int iTable, int *piMoved){ +static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ int rc; MemPage *pPage = 0; BtShared *pBt = p->pBt; assert( sqlite3BtreeHoldsMutex(p) ); - if( p->inTrans!=TRANS_WRITE ){ - return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; - } + assert( p->inTrans==TRANS_WRITE ); /* It is illegal to drop a table if any cursors are open on the ** database. This is because in auto-vacuum mode the backend may ** need to move another root-page to fill a gap left by the deleted ** root page. If an open cursor was using this page a problem would ** occur. + ** + ** This error is caught long before control reaches this point. */ - if( pBt->pCursor ){ - return SQLITE_LOCKED; + if( NEVER(pBt->pCursor) ){ + sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db); + return SQLITE_LOCKED_SHAREDCACHE; } - rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0); + rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); if( rc ) return rc; - rc = sqlite3BtreeClearTable(p, iTable); + rc = sqlite3BtreeClearTable(p, iTable, 0); if( rc ){ releasePage(pPage); return rc; @@ -6286,22 +6962,18 @@ static int btreeDropTable(Btree *p, int iTable, int *piMoved){ if( iTable>1 ){ #ifdef SQLITE_OMIT_AUTOVACUUM - rc = freePage(pPage); + freePage(pPage, &rc); releasePage(pPage); #else if( pBt->autoVacuum ){ Pgno maxRootPgno; - rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno); - if( rc!=SQLITE_OK ){ - releasePage(pPage); - return rc; - } + sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); if( iTable==maxRootPgno ){ /* If the table being dropped is the table with the largest root-page ** number in the database, put the root page on the free list. */ - rc = freePage(pPage); + freePage(pPage, &rc); releasePage(pPage); if( rc!=SQLITE_OK ){ return rc; @@ -6313,7 +6985,7 @@ static int btreeDropTable(Btree *p, int iTable, int *piMoved){ */ MemPage *pMove; releasePage(pPage); - rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); + rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); if( rc!=SQLITE_OK ){ return rc; } @@ -6322,11 +6994,9 @@ static int btreeDropTable(Btree *p, int iTable, int *piMoved){ if( rc!=SQLITE_OK ){ return rc; } - rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); - if( rc!=SQLITE_OK ){ - return rc; - } - rc = freePage(pMove); + pMove = 0; + rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); + freePage(pMove, &rc); releasePage(pMove); if( rc!=SQLITE_OK ){ return rc; @@ -6340,22 +7010,23 @@ static int btreeDropTable(Btree *p, int iTable, int *piMoved){ ** PENDING_BYTE_PAGE. */ maxRootPgno--; - if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){ - maxRootPgno--; - } - if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){ + while( maxRootPgno==PENDING_BYTE_PAGE(pBt) + || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ maxRootPgno--; } assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); }else{ - rc = freePage(pPage); + freePage(pPage, &rc); releasePage(pPage); } #endif }else{ - /* If sqlite3BtreeDropTable was called on page 1. */ + /* If sqlite3BtreeDropTable was called on page 1. + ** This really never should happen except in a corrupt + ** database. + */ zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); releasePage(pPage); } @@ -6364,7 +7035,6 @@ static int btreeDropTable(Btree *p, int iTable, int *piMoved){ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ int rc; sqlite3BtreeEnter(p); - p->pBt->db = p->db; rc = btreeDropTable(p, iTable, piMoved); sqlite3BtreeLeave(p); return rc; @@ -6372,6 +7042,9 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ /* +** This function may only be called if the b-tree connection already +** has a read or write transaction open on the database. +** ** Read the meta-information out of a database file. Meta[0] ** is the number of free pages currently in the database. Meta[1] ** through meta[15] are available for use by higher layers. Meta[0] @@ -6381,47 +7054,24 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ ** layer (and the SetCookie and ReadCookie opcodes) the number of ** free pages is not visible. So Cookie[0] is the same as Meta[1]. */ -int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ - DbPage *pDbPage; - int rc; - unsigned char *pP1; +void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ BtShared *pBt = p->pBt; sqlite3BtreeEnter(p); - pBt->db = p->db; - - /* Reading a meta-data value requires a read-lock on page 1 (and hence - ** the sqlite_master table. We grab this lock regardless of whether or - ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page - ** 1 is treated as a special case by queryTableLock() and lockTable()). - */ - rc = queryTableLock(p, 1, READ_LOCK); - if( rc!=SQLITE_OK ){ - sqlite3BtreeLeave(p); - return rc; - } - + assert( p->inTrans>TRANS_NONE ); + assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) ); + assert( pBt->pPage1 ); assert( idx>=0 && idx<=15 ); - rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage); - if( rc ){ - sqlite3BtreeLeave(p); - return rc; - } - pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage); - *pMeta = get4byte(&pP1[36 + idx*4]); - sqlite3PagerUnref(pDbPage); - /* If autovacuumed is disabled in this build but we are trying to - ** access an autovacuumed database, then make the database readonly. - */ + *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); + + /* If auto-vacuum is disabled in this build and this is an auto-vacuum + ** database, mark the database as read-only. */ #ifdef SQLITE_OMIT_AUTOVACUUM - if( idx==4 && *pMeta>0 ) pBt->readOnly = 1; + if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1; #endif - /* Grab the read-lock on page 1. */ - rc = lockTable(p, 1, READ_LOCK); sqlite3BtreeLeave(p); - return rc; } /* @@ -6434,44 +7084,93 @@ int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ int rc; assert( idx>=1 && idx<=15 ); sqlite3BtreeEnter(p); - pBt->db = p->db; - if( p->inTrans!=TRANS_WRITE ){ - rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; - }else{ - assert( pBt->pPage1!=0 ); - pP1 = pBt->pPage1->aData; - rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); - if( rc==SQLITE_OK ){ - put4byte(&pP1[36 + idx*4], iMeta); + assert( p->inTrans==TRANS_WRITE ); + assert( pBt->pPage1!=0 ); + pP1 = pBt->pPage1->aData; + rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); + if( rc==SQLITE_OK ){ + put4byte(&pP1[36 + idx*4], iMeta); #ifndef SQLITE_OMIT_AUTOVACUUM - if( idx==7 ){ - assert( pBt->autoVacuum || iMeta==0 ); - assert( iMeta==0 || iMeta==1 ); - pBt->incrVacuum = iMeta; - } -#endif + if( idx==BTREE_INCR_VACUUM ){ + assert( pBt->autoVacuum || iMeta==0 ); + assert( iMeta==0 || iMeta==1 ); + pBt->incrVacuum = (u8)iMeta; } +#endif } sqlite3BtreeLeave(p); return rc; } +#ifndef SQLITE_OMIT_BTREECOUNT /* -** Return the flag byte at the beginning of the page that the cursor -** is currently pointing to. +** The first argument, pCur, is a cursor opened on some b-tree. Count the +** number of entries in the b-tree and write the result to *pnEntry. +** +** SQLITE_OK is returned if the operation is successfully executed. +** Otherwise, if an error is encountered (i.e. an IO error or database +** corruption) an SQLite error code is returned. */ -int sqlite3BtreeFlags(BtCursor *pCur){ - /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call - ** restoreCursorPosition() here. +int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){ + i64 nEntry = 0; /* Value to return in *pnEntry */ + int rc; /* Return code */ + rc = moveToRoot(pCur); + + /* Unless an error occurs, the following loop runs one iteration for each + ** page in the B-Tree structure (not including overflow pages). */ - MemPage *pPage; - restoreCursorPosition(pCur); - pPage = pCur->pPage; - assert( cursorHoldsMutex(pCur) ); - assert( pPage->pBt==pCur->pBt ); - return pPage ? pPage->aData[pPage->hdrOffset] : 0; -} + while( rc==SQLITE_OK ){ + int iIdx; /* Index of child node in parent */ + MemPage *pPage; /* Current page of the b-tree */ + + /* If this is a leaf page or the tree is not an int-key tree, then + ** this page contains countable entries. Increment the entry counter + ** accordingly. + */ + pPage = pCur->apPage[pCur->iPage]; + if( pPage->leaf || !pPage->intKey ){ + nEntry += pPage->nCell; + } + /* pPage is a leaf node. This loop navigates the cursor so that it + ** points to the first interior cell that it points to the parent of + ** the next page in the tree that has not yet been visited. The + ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell + ** of the page, or to the number of cells in the page if the next page + ** to visit is the right-child of its parent. + ** + ** If all pages in the tree have been visited, return SQLITE_OK to the + ** caller. + */ + if( pPage->leaf ){ + do { + if( pCur->iPage==0 ){ + /* All pages of the b-tree have been visited. Return successfully. */ + *pnEntry = nEntry; + return SQLITE_OK; + } + moveToParent(pCur); + }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell ); + + pCur->aiIdx[pCur->iPage]++; + pPage = pCur->apPage[pCur->iPage]; + } + + /* Descend to the child node of the cell that the cursor currently + ** points at. This is the right-child if (iIdx==pPage->nCell). + */ + iIdx = pCur->aiIdx[pCur->iPage]; + if( iIdx==pPage->nCell ){ + rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); + }else{ + rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); + } + } + + /* An error has occurred. Return an error code. */ + return rc; +} +#endif /* ** Return the pager associated with a BTree. This routine is used for @@ -6519,9 +7218,9 @@ static void checkAppendMsg( ** ** Also check that the page number is in bounds. */ -static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){ +static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){ if( iPage==0 ) return 1; - if( iPage>pCheck->nPage || iPage<0 ){ + if( iPage>pCheck->nPage ){ checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); return 1; } @@ -6551,6 +7250,7 @@ static void checkPtrmap( rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); if( rc!=SQLITE_OK ){ + if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1; checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); return; } @@ -6656,7 +7356,6 @@ static void checkList( static int checkTreePage( IntegrityCk *pCheck, /* Context for the sanity check */ int iPage, /* Page number of the page to check */ - MemPage *pParent, /* Parent page */ char *zParentContext /* Parent context */ ){ MemPage *pPage; @@ -6667,7 +7366,7 @@ static int checkTreePage( BtShared *pBt; int usableSize; char zContext[100]; - char *hit; + char *hit = 0; sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); @@ -6677,14 +7376,19 @@ static int checkTreePage( usableSize = pBt->usableSize; if( iPage==0 ) return 0; if( checkRef(pCheck, iPage, zParentContext) ) return 0; - if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ + if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ checkAppendMsg(pCheck, zContext, "unable to get the page. error code=%d", rc); return 0; } - if( (rc = sqlite3BtreeInitPage(pPage, pParent))!=0 ){ + + /* Clear MemPage.isInit to make sure the corruption detection code in + ** btreeInitPage() is executed. */ + pPage->isInit = 0; + if( (rc = btreeInitPage(pPage))!=0 ){ + assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ checkAppendMsg(pCheck, zContext, - "sqlite3BtreeInitPage() returns error code %d", rc); + "btreeInitPage() returns error code %d", rc); releasePage(pPage); return 0; } @@ -6694,7 +7398,7 @@ static int checkTreePage( depth = 0; for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ u8 *pCell; - int sz; + u32 sz; CellInfo info; /* Check payload overflow pages @@ -6702,11 +7406,13 @@ static int checkTreePage( sqlite3_snprintf(sizeof(zContext), zContext, "On tree page %d cell %d: ", iPage, i); pCell = findCell(pPage,i); - sqlite3BtreeParseCellPtr(pPage, pCell, &info); + btreeParseCellPtr(pPage, pCell, &info); sz = info.nData; - if( !pPage->intKey ) sz += info.nKey; + if( !pPage->intKey ) sz += (int)info.nKey; assert( sz==info.nPayload ); - if( sz>info.nLocal ){ + if( (sz>info.nLocal) + && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize]) + ){ int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); #ifndef SQLITE_OMIT_AUTOVACUUM @@ -6726,7 +7432,7 @@ static int checkTreePage( checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); } #endif - d2 = checkTreePage(pCheck,pgno,pPage,zContext); + d2 = checkTreePage(pCheck, pgno, zContext); if( i>0 && d2!=depth ){ checkAppendMsg(pCheck, zContext, "Child page depth differs"); } @@ -6742,7 +7448,7 @@ static int checkTreePage( checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0); } #endif - checkTreePage(pCheck, pgno, pPage, zContext); + checkTreePage(pCheck, pgno, zContext); } /* Check for complete coverage of the page @@ -6753,32 +7459,37 @@ static int checkTreePage( if( hit==0 ){ pCheck->mallocFailed = 1; }else{ - memset(hit, 0, usableSize ); - memset(hit, 1, get2byte(&data[hdr+5])); + u16 contentOffset = get2byte(&data[hdr+5]); + assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ + memset(hit+contentOffset, 0, usableSize-contentOffset); + memset(hit, 1, contentOffset); nCell = get2byte(&data[hdr+3]); cellStart = hdr + 12 - 4*pPage->leaf; for(i=0; i<nCell; i++){ int pc = get2byte(&data[cellStart+i*2]); - u16 size = cellSizePtr(pPage, &data[pc]); + u16 size = 1024; int j; - if( (pc+size-1)>=usableSize || pc<0 ){ + if( pc<=usableSize-4 ){ + size = cellSizePtr(pPage, &data[pc]); + } + if( (pc+size-1)>=usableSize ){ checkAppendMsg(pCheck, 0, "Corruption detected in cell %d on page %d",i,iPage,0); }else{ for(j=pc+size-1; j>=pc; j--) hit[j]++; } } - for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; - cnt++){ - int size = get2byte(&data[i+2]); - int j; - if( (i+size-1)>=usableSize || i<0 ){ - checkAppendMsg(pCheck, 0, - "Corruption detected in cell %d on page %d",i,iPage,0); - }else{ - for(j=i+size-1; j>=i; j--) hit[j]++; - } - i = get2byte(&data[i]); + i = get2byte(&data[hdr+1]); + while( i>0 ){ + int size, j; + assert( i<=usableSize-4 ); /* Enforced by btreeInitPage() */ + size = get2byte(&data[i+2]); + assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */ + for(j=i+size-1; j>=i; j--) hit[j]++; + j = get2byte(&data[i]); + assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */ + assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */ + i = j; } for(i=cnt=0; i<usableSize; i++){ if( hit[i]==0 ){ @@ -6791,12 +7502,11 @@ static int checkTreePage( } if( cnt!=data[hdr+7] ){ checkAppendMsg(pCheck, 0, - "Fragmented space is %d byte reported as %d on page %d", + "Fragmentation of %d bytes reported as %d on page %d", cnt, data[hdr+7], iPage); } } sqlite3PageFree(hit); - releasePage(pPage); return depth+1; } @@ -6808,10 +7518,13 @@ static int checkTreePage( ** an array of pages numbers were each page number is the root page of ** a table. nRoot is the number of entries in aRoot. ** +** A read-only or read-write transaction must be opened before calling +** this function. +** ** Write the number of error seen in *pnErr. Except for some memory -** allocation errors, nn error message is held in memory obtained from +** allocation errors, an error message held in memory obtained from ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is -** returned. +** returned. If a memory allocation error occurs, NULL is returned. */ char *sqlite3BtreeIntegrityCheck( Btree *p, /* The btree to be checked */ @@ -6820,40 +7533,28 @@ char *sqlite3BtreeIntegrityCheck( int mxErr, /* Stop reporting errors after this many */ int *pnErr /* Write number of errors seen to this variable */ ){ - int i; + Pgno i; int nRef; IntegrityCk sCheck; BtShared *pBt = p->pBt; char zErr[100]; sqlite3BtreeEnter(p); - pBt->db = p->db; + assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); nRef = sqlite3PagerRefcount(pBt->pPager); - if( lockBtreeWithRetry(p)!=SQLITE_OK ){ - *pnErr = 1; - sqlite3BtreeLeave(p); - return sqlite3DbStrDup(0, "cannot acquire a read lock on the database"); - } sCheck.pBt = pBt; sCheck.pPager = pBt->pPager; - sCheck.nPage = pagerPagecount(sCheck.pPager); + sCheck.nPage = pagerPagecount(sCheck.pBt); sCheck.mxErr = mxErr; sCheck.nErr = 0; sCheck.mallocFailed = 0; *pnErr = 0; -#ifndef SQLITE_OMIT_AUTOVACUUM - if( pBt->nTrunc!=0 ){ - sCheck.nPage = pBt->nTrunc; - } -#endif if( sCheck.nPage==0 ){ - unlockBtreeIfUnused(pBt); sqlite3BtreeLeave(p); return 0; } sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); if( !sCheck.anRef ){ - unlockBtreeIfUnused(pBt); *pnErr = 1; sqlite3BtreeLeave(p); return 0; @@ -6872,14 +7573,14 @@ char *sqlite3BtreeIntegrityCheck( /* Check all the tables. */ - for(i=0; i<nRoot && sCheck.mxErr; i++){ + for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ if( aRoot[i]==0 ) continue; #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum && aRoot[i]>1 ){ checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); } #endif - checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: "); + checkTreePage(&sCheck, aRoot[i], "List of tree roots: "); } /* Make sure every page in the file is referenced @@ -6904,10 +7605,11 @@ char *sqlite3BtreeIntegrityCheck( #endif } - /* Make sure this analysis did not leave any unref() pages + /* Make sure this analysis did not leave any unref() pages. + ** This is an internal consistency check; an integrity check + ** of the integrity check. */ - unlockBtreeIfUnused(pBt); - if( nRef != sqlite3PagerRefcount(pBt->pPager) ){ + if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){ checkAppendMsg(&sCheck, 0, "Outstanding page count goes from %d to %d during this analysis", nRef, sqlite3PagerRefcount(pBt->pPager) @@ -6941,17 +7643,6 @@ const char *sqlite3BtreeGetFilename(Btree *p){ } /* -** Return the pathname of the directory that contains the database file. -** -** The pager directory name is invariant as long as the pager is -** open so it is safe to access without the BtShared mutex. -*/ -const char *sqlite3BtreeGetDirname(Btree *p){ - assert( p->pBt->pPager!=0 ); - return sqlite3PagerDirname(p->pBt->pPager); -} - -/* ** Return the pathname of the journal file for this database. The return ** value of this routine is the same regardless of whether the journal file ** has been created or not. @@ -6964,224 +7655,6 @@ const char *sqlite3BtreeGetJournalname(Btree *p){ return sqlite3PagerJournalname(p->pBt->pPager); } -#ifndef SQLITE_OMIT_VACUUM -/* -** Copy the complete content of pBtFrom into pBtTo. A transaction -** must be active for both files. -** -** The size of file pTo may be reduced by this operation. -** If anything goes wrong, the transaction on pTo is rolled back. -** -** If successful, CommitPhaseOne() may be called on pTo before returning. -** The caller should finish committing the transaction on pTo by calling -** sqlite3BtreeCommit(). -*/ -static int btreeCopyFile(Btree *pTo, Btree *pFrom){ - int rc = SQLITE_OK; - Pgno i; - - Pgno nFromPage; /* Number of pages in pFrom */ - Pgno nToPage; /* Number of pages in pTo */ - Pgno nNewPage; /* Number of pages in pTo after the copy */ - - Pgno iSkip; /* Pending byte page in pTo */ - int nToPageSize; /* Page size of pTo in bytes */ - int nFromPageSize; /* Page size of pFrom in bytes */ - - BtShared *pBtTo = pTo->pBt; - BtShared *pBtFrom = pFrom->pBt; - pBtTo->db = pTo->db; - pBtFrom->db = pFrom->db; - - nToPageSize = pBtTo->pageSize; - nFromPageSize = pBtFrom->pageSize; - - if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){ - return SQLITE_ERROR; - } - if( pBtTo->pCursor ){ - return SQLITE_BUSY; - } - - nToPage = pagerPagecount(pBtTo->pPager); - nFromPage = pagerPagecount(pBtFrom->pPager); - iSkip = PENDING_BYTE_PAGE(pBtTo); - - /* Variable nNewPage is the number of pages required to store the - ** contents of pFrom using the current page-size of pTo. - */ - nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) / - (i64)nToPageSize; - - for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){ - - /* Journal the original page. - ** - ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE) - ** in database *pTo (before the copy). This page is never written - ** into the journal file. Unless i==iSkip or the page was not - ** present in pTo before the copy operation, journal page i from pTo. - */ - if( i!=iSkip && i<=nToPage ){ - DbPage *pDbPage = 0; - rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage); - if( rc==SQLITE_OK ){ - rc = sqlite3PagerWrite(pDbPage); - if( rc==SQLITE_OK && i>nFromPage ){ - /* Yeah. It seems wierd to call DontWrite() right after Write(). But - ** that is because the names of those procedures do not exactly - ** represent what they do. Write() really means "put this page in the - ** rollback journal and mark it as dirty so that it will be written - ** to the database file later." DontWrite() undoes the second part of - ** that and prevents the page from being written to the database. The - ** page is still on the rollback journal, though. And that is the - ** whole point of this block: to put pages on the rollback journal. - */ - sqlite3PagerDontWrite(pDbPage); - } - sqlite3PagerUnref(pDbPage); - } - } - - /* Overwrite the data in page i of the target database */ - if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){ - - DbPage *pToPage = 0; - sqlite3_int64 iOff; - - rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage); - if( rc==SQLITE_OK ){ - rc = sqlite3PagerWrite(pToPage); - } - - for( - iOff=(i-1)*nToPageSize; - rc==SQLITE_OK && iOff<i*nToPageSize; - iOff += nFromPageSize - ){ - DbPage *pFromPage = 0; - Pgno iFrom = (iOff/nFromPageSize)+1; - - if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){ - continue; - } - - rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage); - if( rc==SQLITE_OK ){ - char *zTo = sqlite3PagerGetData(pToPage); - char *zFrom = sqlite3PagerGetData(pFromPage); - int nCopy; - - if( nFromPageSize>=nToPageSize ){ - zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize)); - nCopy = nToPageSize; - }else{ - zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize); - nCopy = nFromPageSize; - } - - memcpy(zTo, zFrom, nCopy); - sqlite3PagerUnref(pFromPage); - } - } - - if( pToPage ) sqlite3PagerUnref(pToPage); - } - } - - /* If things have worked so far, the database file may need to be - ** truncated. The complex part is that it may need to be truncated to - ** a size that is not an integer multiple of nToPageSize - the current - ** page size used by the pager associated with B-Tree pTo. - ** - ** For example, say the page-size of pTo is 2048 bytes and the original - ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024 - ** bytes and 9 pages, then the file needs to be truncated to 9KB. - */ - if( rc==SQLITE_OK ){ - if( nFromPageSize!=nToPageSize ){ - sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager); - i64 iSize = (i64)nFromPageSize * (i64)nFromPage; - i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize; - i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize; - - assert( iSize<=iNow ); - - /* Commit phase one syncs the journal file associated with pTo - ** containing the original data. It does not sync the database file - ** itself. After doing this it is safe to use OsTruncate() and other - ** file APIs on the database file directly. - */ - pBtTo->db = pTo->db; - rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1); - if( iSize<iNow && rc==SQLITE_OK ){ - rc = sqlite3OsTruncate(pFile, iSize); - } - - /* The loop that copied data from database pFrom to pTo did not - ** populate the locking page of database pTo. If the page-size of - ** pFrom is smaller than that of pTo, this means some data will - ** not have been copied. - ** - ** This block copies the missing data from database pFrom to pTo - ** using file APIs. This is safe because at this point we know that - ** all of the original data from pTo has been synced into the - ** journal file. At this point it would be safe to do anything at - ** all to the database file except truncate it to zero bytes. - */ - if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){ - i64 iOff; - for( - iOff=iPending; - rc==SQLITE_OK && iOff<(iPending+nToPageSize); - iOff += nFromPageSize - ){ - DbPage *pFromPage = 0; - Pgno iFrom = (iOff/nFromPageSize)+1; - - if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){ - continue; - } - - rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage); - if( rc==SQLITE_OK ){ - char *zFrom = sqlite3PagerGetData(pFromPage); - rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff); - sqlite3PagerUnref(pFromPage); - } - } - } - - /* Sync the database file */ - if( rc==SQLITE_OK ){ - rc = sqlite3PagerSync(pBtTo->pPager); - } - }else{ - rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage); - } - if( rc==SQLITE_OK ){ - pBtTo->pageSizeFixed = 0; - } - } - - if( rc ){ - sqlite3BtreeRollback(pTo); - } - - return rc; -} -int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){ - int rc; - sqlite3BtreeEnter(pTo); - sqlite3BtreeEnter(pFrom); - rc = btreeCopyFile(pTo, pFrom); - sqlite3BtreeLeave(pFrom); - sqlite3BtreeLeave(pTo); - return rc; -} - -#endif /* SQLITE_OMIT_VACUUM */ - /* ** Return non-zero if a transaction is active. */ @@ -7191,19 +7664,18 @@ int sqlite3BtreeIsInTrans(Btree *p){ } /* -** Return non-zero if a statement transaction is active. -*/ -int sqlite3BtreeIsInStmt(Btree *p){ - assert( sqlite3BtreeHoldsMutex(p) ); - return (p->pBt && p->pBt->inStmt); -} - -/* ** Return non-zero if a read (or write) transaction is active. */ int sqlite3BtreeIsInReadTrans(Btree *p){ + assert( p ); assert( sqlite3_mutex_held(p->db->mutex) ); - return (p && (p->inTrans!=TRANS_NONE)); + return p->inTrans!=TRANS_NONE; +} + +int sqlite3BtreeIsInBackup(Btree *p){ + assert( p ); + assert( sqlite3_mutex_held(p->db->mutex) ); + return p->nBackup!=0; } /* @@ -7238,14 +7710,16 @@ void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ } /* -** Return true if another user of the same shared btree as the argument -** handle holds an exclusive lock on the sqlite_master table. +** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared +** btree as the argument handle holds an exclusive lock on the +** sqlite_master table. Otherwise SQLITE_OK. */ int sqlite3BtreeSchemaLocked(Btree *p){ int rc; assert( sqlite3_mutex_held(p->db->mutex) ); sqlite3BtreeEnter(p); - rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK); + rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); + assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); sqlite3BtreeLeave(p); return rc; } @@ -7259,14 +7733,16 @@ int sqlite3BtreeSchemaLocked(Btree *p){ */ int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ int rc = SQLITE_OK; + assert( p->inTrans!=TRANS_NONE ); if( p->sharable ){ u8 lockType = READ_LOCK + isWriteLock; assert( READ_LOCK+1==WRITE_LOCK ); assert( isWriteLock==0 || isWriteLock==1 ); + sqlite3BtreeEnter(p); - rc = queryTableLock(p, iTab, lockType); + rc = querySharedCacheTableLock(p, iTab, lockType); if( rc==SQLITE_OK ){ - rc = lockTable(p, iTab, lockType); + rc = setSharedCacheTableLock(p, iTab, lockType); } sqlite3BtreeLeave(p); } @@ -7279,38 +7755,43 @@ int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ ** Argument pCsr must be a cursor opened for writing on an ** INTKEY table currently pointing at a valid table entry. ** This function modifies the data stored as part of that entry. -** Only the data content may only be modified, it is not possible -** to change the length of the data stored. +** +** Only the data content may only be modified, it is not possible to +** change the length of the data stored. If this function is called with +** parameters that attempt to write past the end of the existing data, +** no modifications are made and SQLITE_CORRUPT is returned. */ int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ + int rc; assert( cursorHoldsMutex(pCsr) ); assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); - assert(pCsr->isIncrblobHandle); + assert( pCsr->isIncrblobHandle ); - restoreCursorPosition(pCsr); + rc = restoreCursorPosition(pCsr); + if( rc!=SQLITE_OK ){ + return rc; + } assert( pCsr->eState!=CURSOR_REQUIRESEEK ); if( pCsr->eState!=CURSOR_VALID ){ return SQLITE_ABORT; } - /* Check some preconditions: + /* Check some assumptions: ** (a) the cursor is open for writing, - ** (b) there is no read-lock on the table being modified and - ** (c) the cursor points at a valid row of an intKey table. + ** (b) there is a read/write transaction open, + ** (c) the connection holds a write-lock on the table (if required), + ** (d) there are no conflicting read-locks, and + ** (e) the cursor points at a valid row of an intKey table. */ if( !pCsr->wrFlag ){ return SQLITE_READONLY; } - assert( !pCsr->pBt->readOnly - && pCsr->pBt->inTransaction==TRANS_WRITE ); - if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){ - return SQLITE_LOCKED; /* The table pCur points to has a read lock */ - } - if( pCsr->eState==CURSOR_INVALID || !pCsr->pPage->intKey ){ - return SQLITE_ERROR; - } + assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE ); + assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); + assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); + assert( pCsr->apPage[pCsr->iPage]->intKey ); - return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1); + return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); } /* |