diff options
Diffstat (limited to 'third_party/sqlite/ext/fts3/fts3.c')
-rw-r--r-- | third_party/sqlite/ext/fts3/fts3.c | 868 |
1 files changed, 338 insertions, 530 deletions
diff --git a/third_party/sqlite/ext/fts3/fts3.c b/third_party/sqlite/ext/fts3/fts3.c index 922cf3f..a17cf38 100644 --- a/third_party/sqlite/ext/fts3/fts3.c +++ b/third_party/sqlite/ext/fts3/fts3.c @@ -208,8 +208,8 @@ ** ** **** Segment merging **** -** To amortize update costs, segments are groups into levels and -** merged in matches. Each increase in level represents exponentially +** To amortize update costs, segments are grouped into levels and +** merged in batches. Each increase in level represents exponentially ** more documents. ** ** New documents (actually, document updates) are tokenized and @@ -285,6 +285,7 @@ #include <ctype.h> #include "fts3.h" +#include "fts3_expr.h" #include "fts3_hash.h" #include "fts3_tokenizer.h" #ifndef SQLITE_CORE @@ -312,11 +313,6 @@ # define FTSTRACE(A) #endif -/* -** Default span for NEAR operators. -*/ -#define SQLITE_FTS3_DEFAULT_NEAR_PARAM 10 - /* It is not safe to call isspace(), tolower(), or isalnum() on ** hi-bit-set characters. This is the same solution used in the ** tokenizer. @@ -1788,90 +1784,6 @@ static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName, /* Forward reference */ typedef struct fulltext_vtab fulltext_vtab; -/* A single term in a query is represented by an instances of -** the following structure. Each word which may match against -** document content is a term. Operators, like NEAR or OR, are -** not terms. Query terms are organized as a flat list stored -** in the Query.pTerms array. -** -** If the QueryTerm.nPhrase variable is non-zero, then the QueryTerm -** is the first in a contiguous string of terms that are either part -** of the same phrase, or connected by the NEAR operator. -** -** If the QueryTerm.nNear variable is non-zero, then the token is followed -** by a NEAR operator with span set to (nNear-1). For example, the -** following query: -** -** The QueryTerm.iPhrase variable stores the index of the token within -** its phrase, indexed starting at 1, or 1 if the token is not part -** of any phrase. -** -** For example, the data structure used to represent the following query: -** -** ... MATCH 'sqlite NEAR/5 google NEAR/2 "search engine"' -** -** is: -** -** {nPhrase=4, iPhrase=1, nNear=6, pTerm="sqlite"}, -** {nPhrase=0, iPhrase=1, nNear=3, pTerm="google"}, -** {nPhrase=0, iPhrase=1, nNear=0, pTerm="search"}, -** {nPhrase=0, iPhrase=2, nNear=0, pTerm="engine"}, -** -** compiling the FTS3 syntax to Query structures is done by the parseQuery() -** function. -*/ -typedef struct QueryTerm { - short int nPhrase; /* How many following terms are part of the same phrase */ - short int iPhrase; /* This is the i-th term of a phrase. */ - short int iColumn; /* Column of the index that must match this term */ - signed char nNear; /* term followed by a NEAR operator with span=(nNear-1) */ - signed char isOr; /* this term is preceded by "OR" */ - signed char isNot; /* this term is preceded by "-" */ - signed char isPrefix; /* this term is followed by "*" */ - char *pTerm; /* text of the term. '\000' terminated. malloced */ - int nTerm; /* Number of bytes in pTerm[] */ -} QueryTerm; - - -/* A query string is parsed into a Query structure. - * - * We could, in theory, allow query strings to be complicated - * nested expressions with precedence determined by parentheses. - * But none of the major search engines do this. (Perhaps the - * feeling is that an parenthesized expression is two complex of - * an idea for the average user to grasp.) Taking our lead from - * the major search engines, we will allow queries to be a list - * of terms (with an implied AND operator) or phrases in double-quotes, - * with a single optional "-" before each non-phrase term to designate - * negation and an optional OR connector. - * - * OR binds more tightly than the implied AND, which is what the - * major search engines seem to do. So, for example: - * - * [one two OR three] ==> one AND (two OR three) - * [one OR two three] ==> (one OR two) AND three - * - * A "-" before a term matches all entries that lack that term. - * The "-" must occur immediately before the term with in intervening - * space. This is how the search engines do it. - * - * A NOT term cannot be the right-hand operand of an OR. If this - * occurs in the query string, the NOT is ignored: - * - * [one OR -two] ==> one OR two - * - */ -typedef struct Query { - fulltext_vtab *pFts; /* The full text index */ - int nTerms; /* Number of terms in the query */ - QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */ - int nextIsOr; /* Set the isOr flag on the next inserted term */ - int nextIsNear; /* Set the isOr flag on the next inserted term */ - int nextColumn; /* Next word parsed must be in this column */ - int dfltColumn; /* The default column */ -} Query; - - /* ** An instance of the following structure keeps track of generated ** matching-word offset information and snippets. @@ -2022,14 +1934,14 @@ typedef struct fulltext_cursor { QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */ sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */ int eof; /* True if at End Of Results */ - Query q; /* Parsed query string */ + Fts3Expr *pExpr; /* Parsed MATCH query string */ Snippet snippet; /* Cached snippet for the current row */ int iColumn; /* Column being searched */ DataBuffer result; /* Doclist results from fulltextQuery */ DLReader reader; /* Result reader if result not empty */ } fulltext_cursor; -static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ +static fulltext_vtab *cursor_vtab(fulltext_cursor *c){ return (fulltext_vtab *) c->base.pVtab; } @@ -3177,18 +3089,6 @@ static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ } } - -/* Free all of the dynamically allocated memory held by *q -*/ -static void queryClear(Query *q){ - int i; - for(i = 0; i < q->nTerms; ++i){ - sqlite3_free(q->pTerms[i].pTerm); - } - sqlite3_free(q->pTerms); - CLEAR(q); -} - /* Free all of the dynamically allocated memory held by the ** Snippet */ @@ -3198,6 +3098,7 @@ static void snippetClear(Snippet *p){ sqlite3_free(p->zSnippet); CLEAR(p); } + /* ** Append a single entry to the p->aMatch[] log. */ @@ -3234,23 +3135,82 @@ static void snippetAppendMatch( #define FTS3_ROTOR_MASK (FTS3_ROTOR_SZ-1) /* +** Function to iterate through the tokens of a compiled expression. +** +** Except, skip all tokens on the right-hand side of a NOT operator. +** This function is used to find tokens as part of snippet and offset +** generation and we do nt want snippets and offsets to report matches +** for tokens on the RHS of a NOT. +*/ +static int fts3NextExprToken(Fts3Expr **ppExpr, int *piToken){ + Fts3Expr *p = *ppExpr; + int iToken = *piToken; + if( iToken<0 ){ + /* In this case the expression p is the root of an expression tree. + ** Move to the first token in the expression tree. + */ + while( p->pLeft ){ + p = p->pLeft; + } + iToken = 0; + }else{ + assert(p && p->eType==FTSQUERY_PHRASE ); + if( iToken<(p->pPhrase->nToken-1) ){ + iToken++; + }else{ + iToken = 0; + while( p->pParent && p->pParent->pLeft!=p ){ + assert( p->pParent->pRight==p ); + p = p->pParent; + } + p = p->pParent; + if( p ){ + assert( p->pRight!=0 ); + p = p->pRight; + while( p->pLeft ){ + p = p->pLeft; + } + } + } + } + + *ppExpr = p; + *piToken = iToken; + return p?1:0; +} + +/* +** Return TRUE if the expression node pExpr is located beneath the +** RHS of a NOT operator. +*/ +static int fts3ExprBeneathNot(Fts3Expr *p){ + Fts3Expr *pParent; + while( p ){ + pParent = p->pParent; + if( pParent && pParent->eType==FTSQUERY_NOT && pParent->pRight==p ){ + return 1; + } + p = pParent; + } + return 0; +} + +/* ** Add entries to pSnippet->aMatch[] for every match that occurs against ** document zDoc[0..nDoc-1] which is stored in column iColumn. */ static void snippetOffsetsOfColumn( - Query *pQuery, - Snippet *pSnippet, - int iColumn, - const char *zDoc, - int nDoc + fulltext_cursor *pCur, /* The fulltest search cursor */ + Snippet *pSnippet, /* The Snippet object to be filled in */ + int iColumn, /* Index of fulltext table column */ + const char *zDoc, /* Text of the fulltext table column */ + int nDoc /* Length of zDoc in bytes */ ){ const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */ sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */ sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */ fulltext_vtab *pVtab; /* The full text index */ int nColumn; /* Number of columns in the index */ - const QueryTerm *aTerm; /* Query string terms */ - int nTerm; /* Number of query string terms */ int i, j; /* Loop counters */ int rc; /* Return code */ unsigned int match, prevMatch; /* Phrase search bitmasks */ @@ -3264,37 +3224,39 @@ static void snippetOffsetsOfColumn( int iRotorBegin[FTS3_ROTOR_SZ]; /* Beginning offset of token */ int iRotorLen[FTS3_ROTOR_SZ]; /* Length of token */ - pVtab = pQuery->pFts; + pVtab = cursor_vtab(pCur); nColumn = pVtab->nColumn; pTokenizer = pVtab->pTokenizer; pTModule = pTokenizer->pModule; rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor); if( rc ) return; pTCursor->pTokenizer = pTokenizer; - aTerm = pQuery->pTerms; - nTerm = pQuery->nTerms; - if( nTerm>=FTS3_ROTOR_SZ ){ - nTerm = FTS3_ROTOR_SZ - 1; - } + prevMatch = 0; - while(1){ - rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos); - if( rc ) break; + while( !pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos) ){ + Fts3Expr *pIter = pCur->pExpr; + int iIter = -1; iRotorBegin[iRotor&FTS3_ROTOR_MASK] = iBegin; iRotorLen[iRotor&FTS3_ROTOR_MASK] = iEnd-iBegin; match = 0; - for(i=0; i<nTerm; i++){ - int iCol; - iCol = aTerm[i].iColumn; + for(i=0; i<(FTS3_ROTOR_SZ-1) && fts3NextExprToken(&pIter, &iIter); i++){ + int nPhrase; /* Number of tokens in current phrase */ + struct PhraseToken *pToken; /* Current token */ + int iCol; /* Column index */ + + if( fts3ExprBeneathNot(pIter) ) continue; + nPhrase = pIter->pPhrase->nToken; + pToken = &pIter->pPhrase->aToken[iIter]; + iCol = pIter->pPhrase->iColumn; if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue; - if( aTerm[i].nTerm>nToken ) continue; - if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue; - assert( aTerm[i].nTerm<=nToken ); - if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue; - if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue; + if( pToken->n>nToken ) continue; + if( !pToken->isPrefix && pToken->n<nToken ) continue; + assert( pToken->n<=nToken ); + if( memcmp(pToken->z, zToken, pToken->n) ) continue; + if( iIter>0 && (prevMatch & (1<<i))==0 ) continue; match |= 1<<i; - if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){ - for(j=aTerm[i].iPhrase-1; j>=0; j--){ + if( i==(FTS3_ROTOR_SZ-2) || nPhrase==iIter+1 ){ + for(j=nPhrase-1; j>=0; j--){ int k = (iRotor-j) & FTS3_ROTOR_MASK; snippetAppendMatch(pSnippet, iColumn, i-j, iPos-j, iRotorBegin[k], iRotorLen[k]); @@ -3324,86 +3286,115 @@ static void snippetOffsetsOfColumn( ** then when this function is called the Snippet contains token offsets ** 0, 4 and 5. This function removes the "0" entry (because the first A ** is not near enough to an E). +** +** When this function is called, the value pointed to by parameter piLeft is +** the integer id of the left-most token in the expression tree headed by +** pExpr. This function increments *piLeft by the total number of tokens +** in the expression tree headed by pExpr. +** +** Return 1 if any trimming occurs. Return 0 if no trimming is required. */ -static void trimSnippetOffsetsForNear(Query *pQuery, Snippet *pSnippet){ - int ii; - int iDir = 1; - - while(iDir>-2) { - assert( iDir==1 || iDir==-1 ); - for(ii=0; ii<pSnippet->nMatch; ii++){ - int jj; - int nNear; - struct snippetMatch *pMatch = &pSnippet->aMatch[ii]; - QueryTerm *pQueryTerm = &pQuery->pTerms[pMatch->iTerm]; - - if( (pMatch->iTerm+iDir)<0 - || (pMatch->iTerm+iDir)>=pQuery->nTerms - ){ - continue; - } - - nNear = pQueryTerm->nNear; - if( iDir<0 ){ - nNear = pQueryTerm[-1].nNear; - } - - if( pMatch->iTerm>=0 && nNear ){ - int isOk = 0; - int iNextTerm = pMatch->iTerm+iDir; - int iPrevTerm = iNextTerm; - - int iEndToken; - int iStartToken; - - if( iDir<0 ){ - int nPhrase = 1; - iStartToken = pMatch->iToken; - while( (pMatch->iTerm+nPhrase)<pQuery->nTerms - && pQuery->pTerms[pMatch->iTerm+nPhrase].iPhrase>1 - ){ - nPhrase++; - } - iEndToken = iStartToken + nPhrase - 1; - }else{ - iEndToken = pMatch->iToken; - iStartToken = pMatch->iToken+1-pQueryTerm->iPhrase; - } +static int trimSnippetOffsets( + Fts3Expr *pExpr, /* The search expression */ + Snippet *pSnippet, /* The set of snippet offsets to be trimmed */ + int *piLeft /* Index of left-most token in pExpr */ +){ + if( pExpr ){ + if( trimSnippetOffsets(pExpr->pLeft, pSnippet, piLeft) ){ + return 1; + } - while( pQuery->pTerms[iNextTerm].iPhrase>1 ){ - iNextTerm--; - } - while( (iPrevTerm+1)<pQuery->nTerms && - pQuery->pTerms[iPrevTerm+1].iPhrase>1 - ){ - iPrevTerm++; + switch( pExpr->eType ){ + case FTSQUERY_PHRASE: + *piLeft += pExpr->pPhrase->nToken; + break; + case FTSQUERY_NEAR: { + /* The right-hand-side of a NEAR operator is always a phrase. The + ** left-hand-side is either a phrase or an expression tree that is + ** itself headed by a NEAR operator. The following initializations + ** set local variable iLeft to the token number of the left-most + ** token in the right-hand phrase, and iRight to the right most + ** token in the same phrase. For example, if we had: + ** + ** <col> MATCH '"abc def" NEAR/2 "ghi jkl"' + ** + ** then iLeft will be set to 2 (token number of ghi) and nToken will + ** be set to 4. + */ + Fts3Expr *pLeft = pExpr->pLeft; + Fts3Expr *pRight = pExpr->pRight; + int iLeft = *piLeft; + int nNear = pExpr->nNear; + int nToken = pRight->pPhrase->nToken; + int jj, ii; + if( pLeft->eType==FTSQUERY_NEAR ){ + pLeft = pLeft->pRight; } - - for(jj=0; isOk==0 && jj<pSnippet->nMatch; jj++){ - struct snippetMatch *p = &pSnippet->aMatch[jj]; - if( p->iCol==pMatch->iCol && (( - p->iTerm==iNextTerm && - p->iToken>iEndToken && - p->iToken<=iEndToken+nNear - ) || ( - p->iTerm==iPrevTerm && - p->iToken<iStartToken && - p->iToken>=iStartToken-nNear - ))){ - isOk = 1; + assert( pRight->eType==FTSQUERY_PHRASE ); + assert( pLeft->eType==FTSQUERY_PHRASE ); + nToken += pLeft->pPhrase->nToken; + + for(ii=0; ii<pSnippet->nMatch; ii++){ + struct snippetMatch *p = &pSnippet->aMatch[ii]; + if( p->iTerm==iLeft ){ + int isOk = 0; + /* Snippet ii is an occurence of query term iLeft in the document. + ** It occurs at position (p->iToken) of the document. We now + ** search for an instance of token (iLeft-1) somewhere in the + ** range (p->iToken - nNear)...(p->iToken + nNear + nToken) within + ** the set of snippetMatch structures. If one is found, proceed. + ** If one cannot be found, then remove snippets ii..(ii+N-1) + ** from the matching snippets, where N is the number of tokens + ** in phrase pRight->pPhrase. + */ + for(jj=0; isOk==0 && jj<pSnippet->nMatch; jj++){ + struct snippetMatch *p2 = &pSnippet->aMatch[jj]; + if( p2->iTerm==(iLeft-1) ){ + if( p2->iToken>=(p->iToken-nNear-1) + && p2->iToken<(p->iToken+nNear+nToken) + ){ + isOk = 1; + } + } + } + if( !isOk ){ + int kk; + for(kk=0; kk<pRight->pPhrase->nToken; kk++){ + pSnippet->aMatch[kk+ii].iTerm = -2; + } + return 1; + } } - } - if( !isOk ){ - for(jj=1-pQueryTerm->iPhrase; jj<=0; jj++){ - pMatch[jj].iTerm = -1; + if( p->iTerm==(iLeft-1) ){ + int isOk = 0; + for(jj=0; isOk==0 && jj<pSnippet->nMatch; jj++){ + struct snippetMatch *p2 = &pSnippet->aMatch[jj]; + if( p2->iTerm==iLeft ){ + if( p2->iToken<=(p->iToken+nNear+1) + && p2->iToken>(p->iToken-nNear-nToken) + ){ + isOk = 1; + } + } + } + if( !isOk ){ + int kk; + for(kk=0; kk<pLeft->pPhrase->nToken; kk++){ + pSnippet->aMatch[ii-kk].iTerm = -2; + } + return 1; + } } - ii = -1; - iDir = 1; } + break; } } - iDir -= 2; + + if( trimSnippetOffsets(pExpr->pRight, pSnippet, piLeft) ){ + return 1; + } } + return 0; } /* @@ -3414,17 +3405,20 @@ static void snippetAllOffsets(fulltext_cursor *p){ int nColumn; int iColumn, i; int iFirst, iLast; - fulltext_vtab *pFts; + int iTerm = 0; + fulltext_vtab *pFts = cursor_vtab(p); - if( p->snippet.nMatch ) return; - if( p->q.nTerms==0 ) return; - pFts = p->q.pFts; + if( p->snippet.nMatch || p->pExpr==0 ){ + return; + } nColumn = pFts->nColumn; iColumn = (p->iCursorType - QUERY_FULLTEXT); if( iColumn<0 || iColumn>=nColumn ){ + /* Look for matches over all columns of the full-text index */ iFirst = 0; iLast = nColumn-1; }else{ + /* Look for matches in the iColumn-th column of the index only */ iFirst = iColumn; iLast = iColumn; } @@ -3433,15 +3427,18 @@ static void snippetAllOffsets(fulltext_cursor *p){ int nDoc; zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); nDoc = sqlite3_column_bytes(p->pStmt, i+1); - snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); + snippetOffsetsOfColumn(p, &p->snippet, i, zDoc, nDoc); } - trimSnippetOffsetsForNear(&p->q, &p->snippet); + while( trimSnippetOffsets(p->pExpr, &p->snippet, &iTerm) ){ + iTerm = 0; + } } /* ** Convert the information in the aMatch[] array of the snippet -** into the string zOffset[0..nOffset-1]. +** into the string zOffset[0..nOffset-1]. This string is used as +** the return of the SQL offsets() function. */ static void snippetOffsetText(Snippet *p){ int i; @@ -3556,7 +3553,7 @@ static void snippetText( aMatch[i].snStatus = SNIPPET_IGNORE; } nDesired = 0; - for(i=0; i<pCursor->q.nTerms; i++){ + for(i=0; i<FTS3_ROTOR_SZ; i++){ for(j=0; j<nMatch; j++){ if( aMatch[j].iTerm==i ){ aMatch[j].snStatus = SNIPPET_DESIRED; @@ -3644,9 +3641,11 @@ static int fulltextClose(sqlite3_vtab_cursor *pCursor){ fulltext_cursor *c = (fulltext_cursor *) pCursor; FTSTRACE(("FTS3 Close %p\n", c)); sqlite3_finalize(c->pStmt); - queryClear(&c->q); + sqlite3Fts3ExprFree(c->pExpr); snippetClear(&c->snippet); - if( c->result.nData!=0 ) dlrDestroy(&c->reader); + if( c->result.nData!=0 ){ + dlrDestroy(&c->reader); + } dataBufferDestroy(&c->result); sqlite3_free(c); return SQLITE_OK; @@ -3703,259 +3702,127 @@ static int termSelect(fulltext_vtab *v, int iColumn, const char *pTerm, int nTerm, int isPrefix, DocListType iType, DataBuffer *out); -/* Return a DocList corresponding to the query term *pTerm. If *pTerm -** is the first term of a phrase query, go ahead and evaluate the phrase -** query and return the doclist for the entire phrase query. +/* +** Return a DocList corresponding to the phrase *pPhrase. ** ** The resulting DL_DOCIDS doclist is stored in pResult, which is ** overwritten. */ -static int docListOfTerm( - fulltext_vtab *v, /* The full text index */ - int iColumn, /* column to restrict to. No restriction if >=nColumn */ - QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */ - DataBuffer *pResult /* Write the result here */ +static int docListOfPhrase( + fulltext_vtab *pTab, /* The full text index */ + Fts3Phrase *pPhrase, /* Phrase to return a doclist corresponding to */ + DocListType eListType, /* Either DL_DOCIDS or DL_POSITIONS */ + DataBuffer *pResult /* Write the result here */ ){ - DataBuffer left, right, new; - int i, rc; - - /* No phrase search if no position info. */ - assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS ); + int ii; + int rc = SQLITE_OK; + int iCol = pPhrase->iColumn; + DocListType eType = eListType; + assert( eType==DL_POSITIONS || eType==DL_DOCIDS ); + if( pPhrase->nToken>1 ){ + eType = DL_POSITIONS; + } /* This code should never be called with buffered updates. */ - assert( v->nPendingData<0 ); + assert( pTab->nPendingData<0 ); - dataBufferInit(&left, 0); - rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix, - (0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS), &left); - if( rc ) return rc; - for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){ - /* If this token is connected to the next by a NEAR operator, and - ** the next token is the start of a phrase, then set nPhraseRight - ** to the number of tokens in the phrase. Otherwise leave it at 1. - */ - int nPhraseRight = 1; - while( (i+nPhraseRight)<=pQTerm->nPhrase - && pQTerm[i+nPhraseRight].nNear==0 - ){ - nPhraseRight++; - } - - dataBufferInit(&right, 0); - rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, - pQTerm[i].isPrefix, DL_POSITIONS, &right); - if( rc ){ - dataBufferDestroy(&left); - return rc; + for(ii=0; rc==SQLITE_OK && ii<pPhrase->nToken; ii++){ + DataBuffer tmp; + struct PhraseToken *p = &pPhrase->aToken[ii]; + rc = termSelect(pTab, iCol, p->z, p->n, p->isPrefix, eType, &tmp); + if( rc==SQLITE_OK ){ + if( ii==0 ){ + *pResult = tmp; + }else{ + DataBuffer res = *pResult; + dataBufferInit(pResult, 0); + if( ii==(pPhrase->nToken-1) ){ + eType = eListType; + } + docListPhraseMerge( + res.pData, res.nData, tmp.pData, tmp.nData, 0, 0, eType, pResult + ); + dataBufferDestroy(&res); + dataBufferDestroy(&tmp); + } } - dataBufferInit(&new, 0); - docListPhraseMerge(left.pData, left.nData, right.pData, right.nData, - pQTerm[i-1].nNear, pQTerm[i-1].iPhrase + nPhraseRight, - ((i<pQTerm->nPhrase) ? DL_POSITIONS : DL_DOCIDS), - &new); - dataBufferDestroy(&left); - dataBufferDestroy(&right); - left = new; - } - *pResult = left; - return SQLITE_OK; -} - -/* Add a new term pTerm[0..nTerm-1] to the query *q. -*/ -static void queryAdd(Query *q, const char *pTerm, int nTerm){ - QueryTerm *t; - ++q->nTerms; - q->pTerms = sqlite3_realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0])); - if( q->pTerms==0 ){ - q->nTerms = 0; - return; } - t = &q->pTerms[q->nTerms - 1]; - CLEAR(t); - t->pTerm = sqlite3_malloc(nTerm+1); - memcpy(t->pTerm, pTerm, nTerm); - t->pTerm[nTerm] = 0; - t->nTerm = nTerm; - t->isOr = q->nextIsOr; - t->isPrefix = 0; - q->nextIsOr = 0; - t->iColumn = q->nextColumn; - q->nextColumn = q->dfltColumn; -} -/* -** Check to see if the string zToken[0...nToken-1] matches any -** column name in the virtual table. If it does, -** return the zero-indexed column number. If not, return -1. -*/ -static int checkColumnSpecifier( - fulltext_vtab *pVtab, /* The virtual table */ - const char *zToken, /* Text of the token */ - int nToken /* Number of characters in the token */ -){ - int i; - for(i=0; i<pVtab->nColumn; i++){ - if( memcmp(pVtab->azColumn[i], zToken, nToken)==0 - && pVtab->azColumn[i][nToken]==0 ){ - return i; - } - } - return -1; + return rc; } /* -** Parse the text at pSegment[0..nSegment-1]. Add additional terms -** to the query being assemblied in pQuery. -** -** inPhrase is true if pSegment[0..nSegement-1] is contained within -** double-quotes. If inPhrase is true, then the first term -** is marked with the number of terms in the phrase less one and -** OR and "-" syntax is ignored. If inPhrase is false, then every -** term found is marked with nPhrase=0 and OR and "-" syntax is significant. -*/ -static int tokenizeSegment( - sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */ - const char *pSegment, int nSegment, /* Query expression being parsed */ - int inPhrase, /* True if within "..." */ - Query *pQuery /* Append results here */ +** Evaluate the full-text expression pExpr against fts3 table pTab. Write +** the results into pRes. +*/ +static int evalFts3Expr( + fulltext_vtab *pTab, /* Fts3 Virtual table object */ + Fts3Expr *pExpr, /* Parsed fts3 expression */ + DataBuffer *pRes /* OUT: Write results of the expression here */ ){ - const sqlite3_tokenizer_module *pModule = pTokenizer->pModule; - sqlite3_tokenizer_cursor *pCursor; - int firstIndex = pQuery->nTerms; - int iCol; - int nTerm = 1; - - int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor); - if( rc!=SQLITE_OK ) return rc; - pCursor->pTokenizer = pTokenizer; + int rc = SQLITE_OK; - while( 1 ){ - const char *pToken; - int nToken, iBegin, iEnd, iPos; + /* Initialize the output buffer. If this is an empty query (pExpr==0), + ** this is all that needs to be done. Empty queries produce empty + ** result sets. + */ + dataBufferInit(pRes, 0); - rc = pModule->xNext(pCursor, - &pToken, &nToken, - &iBegin, &iEnd, &iPos); - if( rc!=SQLITE_OK ) break; - if( !inPhrase && - pSegment[iEnd]==':' && - (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){ - pQuery->nextColumn = iCol; - continue; - } - if( !inPhrase && pQuery->nTerms>0 && nToken==2 - && pSegment[iBegin+0]=='O' - && pSegment[iBegin+1]=='R' - ){ - pQuery->nextIsOr = 1; - continue; - } - if( !inPhrase && pQuery->nTerms>0 && !pQuery->nextIsOr && nToken==4 - && pSegment[iBegin+0]=='N' - && pSegment[iBegin+1]=='E' - && pSegment[iBegin+2]=='A' - && pSegment[iBegin+3]=='R' - ){ - QueryTerm *pTerm = &pQuery->pTerms[pQuery->nTerms-1]; - if( (iBegin+6)<nSegment - && pSegment[iBegin+4] == '/' - && pSegment[iBegin+5]>='0' && pSegment[iBegin+5]<='9' - ){ - pTerm->nNear = (pSegment[iBegin+5] - '0'); - nToken += 2; - if( pSegment[iBegin+6]>='0' && pSegment[iBegin+6]<=9 ){ - pTerm->nNear = pTerm->nNear * 10 + (pSegment[iBegin+6] - '0'); - iEnd++; - } - pModule->xNext(pCursor, &pToken, &nToken, &iBegin, &iEnd, &iPos); - } else { - pTerm->nNear = SQLITE_FTS3_DEFAULT_NEAR_PARAM; + if( pExpr ){ + if( pExpr->eType==FTSQUERY_PHRASE ){ + DocListType eType = DL_DOCIDS; + if( pExpr->pParent && pExpr->pParent->eType==FTSQUERY_NEAR ){ + eType = DL_POSITIONS; } - pTerm->nNear++; - continue; - } - - queryAdd(pQuery, pToken, nToken); - if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){ - pQuery->pTerms[pQuery->nTerms-1].isNot = 1; - } - if( iEnd<nSegment && pSegment[iEnd]=='*' ){ - pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1; - } - pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm; - if( inPhrase ){ - nTerm++; - } - } - - if( inPhrase && pQuery->nTerms>firstIndex ){ - pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1; - } - - return pModule->xClose(pCursor); -} - -/* Parse a query string, yielding a Query object pQuery. -** -** The calling function will need to queryClear() to clean up -** the dynamically allocated memory held by pQuery. -*/ -static int parseQuery( - fulltext_vtab *v, /* The fulltext index */ - const char *zInput, /* Input text of the query string */ - int nInput, /* Size of the input text */ - int dfltColumn, /* Default column of the index to match against */ - Query *pQuery /* Write the parse results here. */ -){ - int iInput, inPhrase = 0; - int ii; - QueryTerm *aTerm; - - if( zInput==0 ) nInput = 0; - if( nInput<0 ) nInput = strlen(zInput); - pQuery->nTerms = 0; - pQuery->pTerms = NULL; - pQuery->nextIsOr = 0; - pQuery->nextColumn = dfltColumn; - pQuery->dfltColumn = dfltColumn; - pQuery->pFts = v; - - for(iInput=0; iInput<nInput; ++iInput){ - int i; - for(i=iInput; i<nInput && zInput[i]!='"'; ++i){} - if( i>iInput ){ - tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase, - pQuery); - } - iInput = i; - if( i<nInput ){ - assert( zInput[i]=='"' ); - inPhrase = !inPhrase; - } - } - - if( inPhrase ){ - /* unmatched quote */ - queryClear(pQuery); - return SQLITE_ERROR; - } + rc = docListOfPhrase(pTab, pExpr->pPhrase, eType, pRes); + }else{ + DataBuffer lhs; + DataBuffer rhs; - /* Modify the values of the QueryTerm.nPhrase variables to account for - ** the NEAR operator. For the purposes of QueryTerm.nPhrase, phrases - ** and tokens connected by the NEAR operator are handled as a single - ** phrase. See comments above the QueryTerm structure for details. - */ - aTerm = pQuery->pTerms; - for(ii=0; ii<pQuery->nTerms; ii++){ - if( aTerm[ii].nNear || aTerm[ii].nPhrase ){ - while (aTerm[ii+aTerm[ii].nPhrase].nNear) { - aTerm[ii].nPhrase += (1 + aTerm[ii+aTerm[ii].nPhrase+1].nPhrase); + dataBufferInit(&rhs, 0); + if( SQLITE_OK==(rc = evalFts3Expr(pTab, pExpr->pLeft, &lhs)) + && SQLITE_OK==(rc = evalFts3Expr(pTab, pExpr->pRight, &rhs)) + ){ + switch( pExpr->eType ){ + case FTSQUERY_NEAR: { + int nToken; + Fts3Expr *pLeft; + DocListType eType = DL_DOCIDS; + if( pExpr->pParent && pExpr->pParent->eType==FTSQUERY_NEAR ){ + eType = DL_POSITIONS; + } + pLeft = pExpr->pLeft; + while( pLeft->eType==FTSQUERY_NEAR ){ + pLeft=pLeft->pRight; + } + assert( pExpr->pRight->eType==FTSQUERY_PHRASE ); + assert( pLeft->eType==FTSQUERY_PHRASE ); + nToken = pLeft->pPhrase->nToken + pExpr->pRight->pPhrase->nToken; + docListPhraseMerge(lhs.pData, lhs.nData, rhs.pData, rhs.nData, + pExpr->nNear+1, nToken, eType, pRes + ); + break; + } + case FTSQUERY_NOT: { + docListExceptMerge(lhs.pData, lhs.nData, rhs.pData, rhs.nData,pRes); + break; + } + case FTSQUERY_AND: { + docListAndMerge(lhs.pData, lhs.nData, rhs.pData, rhs.nData, pRes); + break; + } + case FTSQUERY_OR: { + docListOrMerge(lhs.pData, lhs.nData, rhs.pData, rhs.nData, pRes); + break; + } + } } + dataBufferDestroy(&lhs); + dataBufferDestroy(&rhs); } } - return SQLITE_OK; + return rc; } /* TODO(shess) Refactor the code to remove this forward decl. */ @@ -3974,12 +3841,9 @@ static int fulltextQuery( const char *zInput, /* The query string */ int nInput, /* Number of bytes in zInput[] */ DataBuffer *pResult, /* Write the result doclist here */ - Query *pQuery /* Put parsed query string here */ + Fts3Expr **ppExpr /* Put parsed query string here */ ){ - int i, iNext, rc; - DataBuffer left, right, or, new; - int nNot = 0; - QueryTerm *aTerm; + int rc; /* TODO(shess) Instead of flushing pendingTerms, we could query for ** the relevant term and merge the doclist into what we receive from @@ -3991,86 +3855,20 @@ static int fulltextQuery( /* Flush any buffered updates before executing the query. */ rc = flushPendingTerms(v); - if( rc!=SQLITE_OK ) return rc; - - /* TODO(shess) I think that the queryClear() calls below are not - ** necessary, because fulltextClose() already clears the query. - */ - rc = parseQuery(v, zInput, nInput, iColumn, pQuery); - if( rc!=SQLITE_OK ) return rc; - - /* Empty or NULL queries return no results. */ - if( pQuery->nTerms==0 ){ - dataBufferInit(pResult, 0); - return SQLITE_OK; - } - - /* Merge AND terms. */ - /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */ - aTerm = pQuery->pTerms; - for(i = 0; i<pQuery->nTerms; i=iNext){ - if( aTerm[i].isNot ){ - /* Handle all NOT terms in a separate pass */ - nNot++; - iNext = i + aTerm[i].nPhrase+1; - continue; - } - iNext = i + aTerm[i].nPhrase + 1; - rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right); - if( rc ){ - if( i!=nNot ) dataBufferDestroy(&left); - queryClear(pQuery); - return rc; - } - while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){ - rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or); - iNext += aTerm[iNext].nPhrase + 1; - if( rc ){ - if( i!=nNot ) dataBufferDestroy(&left); - dataBufferDestroy(&right); - queryClear(pQuery); - return rc; - } - dataBufferInit(&new, 0); - docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new); - dataBufferDestroy(&right); - dataBufferDestroy(&or); - right = new; - } - if( i==nNot ){ /* first term processed. */ - left = right; - }else{ - dataBufferInit(&new, 0); - docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new); - dataBufferDestroy(&right); - dataBufferDestroy(&left); - left = new; - } - } - - if( nNot==pQuery->nTerms ){ - /* We do not yet know how to handle a query of only NOT terms */ - return SQLITE_ERROR; + if( rc!=SQLITE_OK ){ + return rc; } - /* Do the EXCEPT terms */ - for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){ - if( !aTerm[i].isNot ) continue; - rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right); - if( rc ){ - queryClear(pQuery); - dataBufferDestroy(&left); - return rc; - } - dataBufferInit(&new, 0); - docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new); - dataBufferDestroy(&right); - dataBufferDestroy(&left); - left = new; + /* Parse the query passed to the MATCH operator. */ + rc = sqlite3Fts3ExprParse(v->pTokenizer, + v->azColumn, v->nColumn, iColumn, zInput, nInput, ppExpr + ); + if( rc!=SQLITE_OK ){ + assert( 0==(*ppExpr) ); + return rc; } - *pResult = left; - return rc; + return evalFts3Expr(v, *ppExpr, pResult); } /* @@ -4150,10 +3948,10 @@ static int fulltextFilter( default: /* full-text search */ { + int iCol = idxNum-QUERY_FULLTEXT; const char *zQuery = (const char *)sqlite3_value_text(argv[0]); assert( idxNum<=QUERY_FULLTEXT+v->nColumn); assert( argc==1 ); - queryClear(&c->q); if( c->result.nData!=0 ){ /* This case happens if the same cursor is used repeatedly. */ dlrDestroy(&c->reader); @@ -4161,7 +3959,7 @@ static int fulltextFilter( }else{ dataBufferInit(&c->result, 0); } - rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q); + rc = fulltextQuery(v, iCol, zQuery, -1, &c->result, &c->pExpr); if( rc!=SQLITE_OK ) return rc; if( c->result.nData!=0 ){ dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData); @@ -6045,9 +5843,14 @@ static int loadSegment(fulltext_vtab *v, const char *pData, int nData, /* Scan the database and merge together the posting lists for the term ** into *out. */ -static int termSelect(fulltext_vtab *v, int iColumn, - const char *pTerm, int nTerm, int isPrefix, - DocListType iType, DataBuffer *out){ +static int termSelect( + fulltext_vtab *v, + int iColumn, + const char *pTerm, int nTerm, /* Term to query for */ + int isPrefix, /* True for a prefix search */ + DocListType iType, + DataBuffer *out /* Write results here */ +){ DataBuffer doclist; sqlite3_stmt *s; int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s); @@ -6057,6 +5860,7 @@ static int termSelect(fulltext_vtab *v, int iColumn, assert( v->nPendingData<0 ); dataBufferInit(&doclist, 0); + dataBufferInit(out, 0); /* Traverse the segments from oldest to newest so that newer doclist ** elements for given docids overwrite older elements. @@ -6606,7 +6410,7 @@ static void optimizeFunc(sqlite3_context *pContext, i++; } - /* If we managed to succesfully read them all, optimize them. */ + /* If we managed to successfully read them all, optimize them. */ if( rc==SQLITE_DONE ){ assert( i==nReaders ); rc = optimizeInternal(v, readers, nReaders, &writer); @@ -7174,6 +6978,10 @@ int sqlite3Fts3Init(sqlite3 *db){ } } +#ifdef SQLITE_TEST + sqlite3Fts3ExprInitTestInterface(db); +#endif + /* Create the virtual table wrapper around the hash-table and overload ** the two scalar functions. If this is successful, register the ** module with sqlite. @@ -7193,7 +7001,7 @@ int sqlite3Fts3Init(sqlite3 *db){ ); } - /* An error has occured. Delete the hash table and return the error code. */ + /* An error has occurred. Delete the hash table and return the error code. */ assert( rc!=SQLITE_OK ); if( pHash ){ sqlite3Fts3HashClear(pHash); |