diff options
author | Carlos Garnacho <carlos@lanedo.com> | 2013-01-21 16:55:33 +0100 |
---|---|---|
committer | Martyn Russell <martyn@lanedo.com> | 2013-02-04 15:43:13 +0000 |
commit | ee253d914609656ddcf3c4b9c3f0a0ffc07c00c8 (patch) | |
tree | 7b22d3bbcc669eaa805911d4ae63e073689faa9d /src/libtracker-fts | |
parent | 5f30d16fcdf1d07802c0fb02edefce51c8bbcfaf (diff) | |
download | tracker-ee253d914609656ddcf3c4b9c3f0a0ffc07c00c8.tar.gz |
Update FTS code
Diffstat (limited to 'src/libtracker-fts')
-rw-r--r-- | src/libtracker-fts/Makefile.am | 2 | ||||
-rw-r--r-- | src/libtracker-fts/fts3.c | 889 | ||||
-rw-r--r-- | src/libtracker-fts/fts3.h | 9 | ||||
-rw-r--r-- | src/libtracker-fts/fts3Int.h | 91 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_aux.c | 6 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_expr.c | 75 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_icu.c | 11 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_porter.c | 3 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_snippet.c | 91 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_term.c | 16 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_test.c | 213 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_tokenizer.c | 39 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_tokenizer.h | 11 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_tokenizer1.c | 1 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_unicode.c | 393 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_unicode2.c | 366 | ||||
-rw-r--r-- | src/libtracker-fts/fts3_write.c | 2490 |
17 files changed, 4228 insertions, 478 deletions
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am index 1780f0623..6e4b7f0ba 100644 --- a/src/libtracker-fts/Makefile.am +++ b/src/libtracker-fts/Makefile.am @@ -23,6 +23,8 @@ fts4_sources = \ fts3_tokenizer1.c \ fts3_tokenizer.c \ fts3_tokenizer.h \ + fts3_unicode.c \ + fts3_unicode2.c \ fts3_write.c libtracker_fts_la_SOURCES = \ diff --git a/src/libtracker-fts/fts3.c b/src/libtracker-fts/fts3.c index f29970dd2..3dc62ba8d 100644 --- a/src/libtracker-fts/fts3.c +++ b/src/libtracker-fts/fts3.c @@ -70,7 +70,7 @@ ** A doclist is stored like this: ** ** array { -** varint docid; +** varint docid; (delta from previous doclist) ** array { (position list for column 0) ** varint position; (2 more than the delta from previous position) ** } @@ -101,8 +101,8 @@ ** at D signals the start of a new column; the 1 at E indicates that the ** new column is column number 1. There are two positions at 12 and 45 ** (14-2 and 35-2+12). The 0 at H indicate the end-of-document. The -** 234 at I is the next docid. It has one position 72 (72-2) and then -** terminates with the 0 at K. +** 234 at I is the delta to next docid (357). It has one position 70 +** (72-2) and then terminates with the 0 at K. ** ** A "position-list" is the list of positions for multiple columns for ** a single docid. A "column-list" is the set of positions for a single @@ -286,10 +286,6 @@ ** will eventually overtake the earlier data and knock it out. The ** query logic likewise merges doclists so that newer data knocks out ** older data. -** -** TODO(shess) Provide a VACUUM type operation to clear out all -** deletions and duplications. This would basically be a forced merge -** into a single segment. */ #include "fts3Int.h" @@ -308,8 +304,8 @@ #include "fts3.h" #ifndef SQLITE_CORE - # include "sqlite3ext.h" - SQLITE_EXTENSION_INIT1 +# include "sqlite3ext.h" + SQLITE_EXTENSION_INIT1 #endif static int fts3EvalNext(Fts3Cursor *pCsr); @@ -438,7 +434,7 @@ static void fts3GetReverseVarint( sqlite3_int64 *pVal ){ sqlite3_int64 iVal; - char *p = *pp; + char *p; /* Pointer p now points at the first byte past the varint we are ** interested in. So, unless the doclist is corrupt, the 0x80 bit is @@ -468,6 +464,8 @@ static int fts3DisconnectMethod(sqlite3_vtab *pVtab){ sqlite3_free(p->zSegmentsTbl); sqlite3_free(p->zReadExprlist); sqlite3_free(p->zWriteExprlist); + sqlite3_free(p->zContentTbl); + sqlite3_free(p->zLanguageid); /* Invoke the tokenizer destructor to free the tokenizer. */ p->pTokenizer->pModule->xDestroy(p->pTokenizer); @@ -507,16 +505,19 @@ static void fts3DbExec( ** The xDestroy() virtual table method. */ static int fts3DestroyMethod(sqlite3_vtab *pVtab){ - int rc = SQLITE_OK; /* Return code */ Fts3Table *p = (Fts3Table *)pVtab; - sqlite3 *db = p->db; + int rc = SQLITE_OK; /* Return code */ + const char *zDb = p->zDb; /* Name of database (e.g. "main", "temp") */ + sqlite3 *db = p->db; /* Database handle */ /* Drop the shadow tables */ - fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_content'", p->zDb, p->zName); - fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_segments'", p->zDb,p->zName); - fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_segdir'", p->zDb, p->zName); - fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_docsize'", p->zDb, p->zName); - fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_stat'", p->zDb, p->zName); + if( p->zContentTbl==0 ){ + fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_content'", zDb, p->zName); + } + fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_segments'", zDb,p->zName); + fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_segdir'", zDb, p->zName); + fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_docsize'", zDb, p->zName); + fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_stat'", zDb, p->zName); /* If everything has worked, invoke fts3DisconnectMethod() to free the ** memory associated with the Fts3Table structure and return SQLITE_OK. @@ -541,7 +542,9 @@ static void fts3DeclareVtab(int *pRc, Fts3Table *p){ int rc; /* Return code */ char *zSql; /* SQL statement passed to declare_vtab() */ char *zCols; /* List of user defined columns */ + const char *zLanguageid; + zLanguageid = (p->zLanguageid ? p->zLanguageid : "__langid"); sqlite3_vtab_config(p->db, SQLITE_VTAB_CONSTRAINT_SUPPORT, 1); /* Create a list of user columns for the virtual table */ @@ -552,7 +555,8 @@ static void fts3DeclareVtab(int *pRc, Fts3Table *p){ /* Create the whole "CREATE TABLE" statement to pass to SQLite */ zSql = sqlite3_mprintf( - "CREATE TABLE x(%s %Q HIDDEN, docid HIDDEN)", zCols, p->zName + "CREATE TABLE x(%s %Q HIDDEN, docid HIDDEN, %Q HIDDEN)", + zCols, p->zName, zLanguageid ); if( !zCols || !zSql ){ rc = SQLITE_NOMEM; @@ -567,6 +571,18 @@ static void fts3DeclareVtab(int *pRc, Fts3Table *p){ } /* +** Create the %_stat table if it does not already exist. +*/ +void sqlite3Fts3CreateStatTable(int *pRc, Fts3Table *p){ + fts3DbExec(pRc, p->db, + "CREATE TABLE IF NOT EXISTS %Q.'%q_stat'" + "(id INTEGER PRIMARY KEY, value BLOB);", + p->zDb, p->zName + ); + if( (*pRc)==SQLITE_OK ) p->bHasStat = 1; +} + +/* ** Create the backing store tables (%_content, %_segments and %_segdir) ** required by the FTS3 table passed as the only argument. This is done ** as part of the vtab xCreate() method. @@ -578,23 +594,31 @@ static void fts3DeclareVtab(int *pRc, Fts3Table *p){ static int fts3CreateTables(Fts3Table *p){ int rc = SQLITE_OK; /* Return code */ int i; /* Iterator variable */ - char *zContentCols; /* Columns of %_content table */ sqlite3 *db = p->db; /* The database connection */ - /* Create a list of user columns for the content table */ - zContentCols = sqlite3_mprintf("docid INTEGER PRIMARY KEY"); - for(i=0; zContentCols && i<p->nColumn; i++){ - char *z = p->azColumn[i]; - zContentCols = sqlite3_mprintf("%z, 'c%d%q'", zContentCols, i, z); + if( p->zContentTbl==0 ){ + const char *zLanguageid = p->zLanguageid; + char *zContentCols; /* Columns of %_content table */ + + /* Create a list of user columns for the content table */ + zContentCols = sqlite3_mprintf("docid INTEGER PRIMARY KEY"); + for(i=0; zContentCols && i<p->nColumn; i++){ + char *z = p->azColumn[i]; + zContentCols = sqlite3_mprintf("%z, 'c%d%q'", zContentCols, i, z); + } + if( zLanguageid && zContentCols ){ + zContentCols = sqlite3_mprintf("%z, langid", zContentCols, zLanguageid); + } + if( zContentCols==0 ) rc = SQLITE_NOMEM; + + /* Create the content table */ + fts3DbExec(&rc, db, + "CREATE TABLE %Q.'%q_content'(%s)", + p->zDb, p->zName, zContentCols + ); + sqlite3_free(zContentCols); } - if( zContentCols==0 ) rc = SQLITE_NOMEM; - /* Create the content table */ - fts3DbExec(&rc, db, - "CREATE TABLE %Q.'%q_content'(%s)", - p->zDb, p->zName, zContentCols - ); - sqlite3_free(zContentCols); /* Create other tables */ fts3DbExec(&rc, db, "CREATE TABLE %Q.'%q_segments'(blockid INTEGER PRIMARY KEY, block BLOB);", @@ -618,11 +642,9 @@ static int fts3CreateTables(Fts3Table *p){ p->zDb, p->zName ); } + assert( p->bHasStat==p->bFts4 ); if( p->bHasStat ){ - fts3DbExec(&rc, db, - "CREATE TABLE %Q.'%q_stat'(id INTEGER PRIMARY KEY, value BLOB);", - p->zDb, p->zName - ); + sqlite3Fts3CreateStatTable(&rc, p); } return rc; } @@ -704,6 +726,7 @@ static void fts3Appendf( char *z; va_start(ap, zFormat); z = sqlite3_vmprintf(zFormat, ap); + va_end(ap); if( z && *pz ){ char *z2 = sqlite3_mprintf("%s%s", *pz, z); sqlite3_free(z); @@ -728,7 +751,7 @@ static void fts3Appendf( static char *fts3QuoteId(char const *zInput){ int nRet; char *zRet; - nRet = 2 + strlen(zInput)*2 + 1; + nRet = 2 + (int)strlen(zInput)*2 + 1; zRet = sqlite3_malloc(nRet); if( zRet ){ int i; @@ -745,8 +768,8 @@ static char *fts3QuoteId(char const *zInput){ } /* -** Return a list of comma separated SQL expressions that could be used -** in a SELECT statement such as the following: +** Return a list of comma separated SQL expressions and a FROM clause that +** could be used in a SELECT statement such as the following: ** ** SELECT <list of expressions> FROM %_content AS x ... ** @@ -757,7 +780,7 @@ static char *fts3QuoteId(char const *zInput){ ** table has the three user-defined columns "a", "b", and "c", the following ** string is returned: ** -** "docid, unzip(x.'a'), unzip(x.'b'), unzip(x.'c')" +** "docid, unzip(x.'a'), unzip(x.'b'), unzip(x.'c') FROM %_content AS x" ** ** The pointer returned points to a buffer allocated by sqlite3_malloc(). It ** is the responsibility of the caller to eventually free it. @@ -773,16 +796,34 @@ static char *fts3ReadExprList(Fts3Table *p, const char *zFunc, int *pRc){ char *zFunction; int i; - if( !zFunc ){ - zFunction = ""; + if( p->zContentTbl==0 ){ + if( !zFunc ){ + zFunction = ""; + }else{ + zFree = zFunction = fts3QuoteId(zFunc); + } + fts3Appendf(pRc, &zRet, "docid"); + for(i=0; i<p->nColumn; i++){ + fts3Appendf(pRc, &zRet, ",%s(x.'c%d%q')", zFunction, i, p->azColumn[i]); + } + if( p->zLanguageid ){ + fts3Appendf(pRc, &zRet, ", x.%Q", "langid"); + } + sqlite3_free(zFree); }else{ - zFree = zFunction = fts3QuoteId(zFunc); - } - fts3Appendf(pRc, &zRet, "docid"); - for(i=0; i<p->nColumn; i++){ - fts3Appendf(pRc, &zRet, ",%s(x.'c%d%q')", zFunction, i, p->azColumn[i]); + fts3Appendf(pRc, &zRet, "rowid"); + for(i=0; i<p->nColumn; i++){ + fts3Appendf(pRc, &zRet, ", x.'%q'", p->azColumn[i]); + } + if( p->zLanguageid ){ + fts3Appendf(pRc, &zRet, ", x.%Q", p->zLanguageid); + } } - sqlite3_free(zFree); + fts3Appendf(pRc, &zRet, " FROM '%q'.'%q%s' AS x", + p->zDb, + (p->zContentTbl ? p->zContentTbl : p->zName), + (p->zContentTbl ? "" : "_content") + ); return zRet; } @@ -821,6 +862,9 @@ static char *fts3WriteExprList(Fts3Table *p, const char *zFunc, int *pRc){ for(i=0; i<p->nColumn; i++){ fts3Appendf(pRc, &zRet, ",%s(?)", zFunction); } + if( p->zLanguageid ){ + fts3Appendf(pRc, &zRet, ", ?"); + } sqlite3_free(zFree); return zRet; } @@ -839,7 +883,7 @@ static char *fts3WriteExprList(Fts3Table *p, const char *zFunc, int *pRc){ ** This function is used when parsing the "prefix=" FTS4 parameter. */ static int fts3GobbleInt(const char **pp, int *pnOut){ - const char *p = *pp; /* Iterator pointer */ + const char *p; /* Iterator pointer */ int nInt = 0; /* Output value */ for(p=*pp; p[0]>='0' && p[0]<='9'; p++){ @@ -907,6 +951,91 @@ static int fts3PrefixParameter( } /* +** This function is called when initializing an FTS4 table that uses the +** content=xxx option. It determines the number of and names of the columns +** of the new FTS4 table. +** +** The third argument passed to this function is the value passed to the +** config=xxx option (i.e. "xxx"). This function queries the database for +** a table of that name. If found, the output variables are populated +** as follows: +** +** *pnCol: Set to the number of columns table xxx has, +** +** *pnStr: Set to the total amount of space required to store a copy +** of each columns name, including the nul-terminator. +** +** *pazCol: Set to point to an array of *pnCol strings. Each string is +** the name of the corresponding column in table xxx. The array +** and its contents are allocated using a single allocation. It +** is the responsibility of the caller to free this allocation +** by eventually passing the *pazCol value to sqlite3_free(). +** +** If the table cannot be found, an error code is returned and the output +** variables are undefined. Or, if an OOM is encountered, SQLITE_NOMEM is +** returned (and the output variables are undefined). +*/ +static int fts3ContentColumns( + sqlite3 *db, /* Database handle */ + const char *zDb, /* Name of db (i.e. "main", "temp" etc.) */ + const char *zTbl, /* Name of content table */ + const char ***pazCol, /* OUT: Malloc'd array of column names */ + int *pnCol, /* OUT: Size of array *pazCol */ + int *pnStr /* OUT: Bytes of string content */ +){ + int rc = SQLITE_OK; /* Return code */ + char *zSql; /* "SELECT *" statement on zTbl */ + sqlite3_stmt *pStmt = 0; /* Compiled version of zSql */ + + zSql = sqlite3_mprintf("SELECT * FROM %Q.%Q", zDb, zTbl); + if( !zSql ){ + rc = SQLITE_NOMEM; + }else{ + rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0); + } + sqlite3_free(zSql); + + if( rc==SQLITE_OK ){ + const char **azCol; /* Output array */ + int nStr = 0; /* Size of all column names (incl. 0x00) */ + int nCol; /* Number of table columns */ + int i; /* Used to iterate through columns */ + + /* Loop through the returned columns. Set nStr to the number of bytes of + ** space required to store a copy of each column name, including the + ** nul-terminator byte. */ + nCol = sqlite3_column_count(pStmt); + for(i=0; i<nCol; i++){ + const char *zCol = sqlite3_column_name(pStmt, i); + nStr += (int)strlen(zCol) + 1; + } + + /* Allocate and populate the array to return. */ + azCol = (const char **)sqlite3_malloc(sizeof(char *) * nCol + nStr); + if( azCol==0 ){ + rc = SQLITE_NOMEM; + }else{ + char *p = (char *)&azCol[nCol]; + for(i=0; i<nCol; i++){ + const char *zCol = sqlite3_column_name(pStmt, i); + int n = (int)strlen(zCol)+1; + memcpy(p, zCol, n); + azCol[i] = p; + p += n; + } + } + sqlite3_finalize(pStmt); + + /* Set the output variables. */ + *pnCol = nCol; + *pnStr = nStr; + *pazCol = azCol; + } + + return rc; +} + +/* ** This function is the implementation of both the xConnect and xCreate ** methods of the FTS3 virtual table. ** @@ -950,6 +1079,8 @@ static int fts3InitVtab( char *zPrefix = 0; /* Prefix parameter value (or NULL) */ char *zCompress = 0; /* compress=? parameter (or NULL) */ char *zUncompress = 0; /* uncompress=? parameter (or NULL) */ + char *zContent = 0; /* content=? parameter (or NULL) */ + char *zLanguageid = 0; /* languageid=? parameter (or NULL) */ assert( strlen(argv[0])==4 ); assert( (sqlite3_strnicmp(argv[0], "fts4", 4)==0 && isFts4) @@ -993,13 +1124,14 @@ static int fts3InitVtab( struct Fts4Option { const char *zOpt; int nOpt; - char **pzVar; } aFts4Opt[] = { - { "matchinfo", 9, 0 }, /* 0 -> MATCHINFO */ - { "prefix", 6, 0 }, /* 1 -> PREFIX */ - { "compress", 8, 0 }, /* 2 -> COMPRESS */ - { "uncompress", 10, 0 }, /* 3 -> UNCOMPRESS */ - { "order", 5, 0 } /* 4 -> ORDER */ + { "matchinfo", 9 }, /* 0 -> MATCHINFO */ + { "prefix", 6 }, /* 1 -> PREFIX */ + { "compress", 8 }, /* 2 -> COMPRESS */ + { "uncompress", 10 }, /* 3 -> UNCOMPRESS */ + { "order", 5 }, /* 4 -> ORDER */ + { "content", 7 }, /* 5 -> CONTENT */ + { "languageid", 10 } /* 6 -> LANGUAGEID */ }; int iOpt; @@ -1045,13 +1177,26 @@ static int fts3InitVtab( case 4: /* ORDER */ if( (strlen(zVal)!=3 || sqlite3_strnicmp(zVal, "asc", 3)) - && (strlen(zVal)!=4 || sqlite3_strnicmp(zVal, "desc", 3)) + && (strlen(zVal)!=4 || sqlite3_strnicmp(zVal, "desc", 4)) ){ *pzErr = sqlite3_mprintf("unrecognized order: %s", zVal); rc = SQLITE_ERROR; } bDescIdx = (zVal[0]=='d' || zVal[0]=='D'); break; + + case 5: /* CONTENT */ + sqlite3_free(zContent); + zContent = zVal; + zVal = 0; + break; + + case 6: /* LANGUAGEID */ + assert( iOpt==6 ); + sqlite3_free(zLanguageid); + zLanguageid = zVal; + zVal = 0; + break; } } sqlite3_free(zVal); @@ -1064,6 +1209,39 @@ static int fts3InitVtab( aCol[nCol++] = z; } } + + /* If a content=xxx option was specified, the following: + ** + ** 1. Ignore any compress= and uncompress= options. + ** + ** 2. If no column names were specified as part of the CREATE VIRTUAL + ** TABLE statement, use all columns from the content table. + */ + if( rc==SQLITE_OK && zContent ){ + sqlite3_free(zCompress); + sqlite3_free(zUncompress); + zCompress = 0; + zUncompress = 0; + if( nCol==0 ){ + sqlite3_free((void*)aCol); + aCol = 0; + rc = fts3ContentColumns(db, argv[1], zContent, &aCol, &nCol, &nString); + + /* If a languageid= option was specified, remove the language id + ** column from the aCol[] array. */ + if( rc==SQLITE_OK && zLanguageid ){ + int j; + for(j=0; j<nCol; j++){ + if( sqlite3_stricmp(zLanguageid, aCol[j])==0 ){ + int k; + for(k=j; k<nCol; k++) aCol[k] = aCol[k+1]; + nCol--; + break; + } + } + } + } + } if( rc!=SQLITE_OK ) goto fts3_init_out; if( nCol==0 ){ @@ -1107,7 +1285,13 @@ static int fts3InitVtab( p->nMaxPendingData = FTS3_MAX_PENDING_DATA; p->bHasDocsize = (isFts4 && bNoDocsize==0); p->bHasStat = isFts4; + p->bFts4 = isFts4; p->bDescIdx = bDescIdx; + p->bAutoincrmerge = 0xff; /* 0xff means setting unknown */ + p->zContentTbl = zContent; + p->zLanguageid = zLanguageid; + zContent = 0; + zLanguageid = 0; TESTONLY( p->inTransaction = -1 ); TESTONLY( p->mxSavepoint = -1 ); @@ -1156,6 +1340,16 @@ static int fts3InitVtab( rc = fts3CreateTables(p); } + /* Check to see if a legacy fts3 table has been "upgraded" by the + ** addition of a %_stat table so that it can use incremental merge. + */ + if( !isFts4 && !isCreate ){ + int rc2 = SQLITE_OK; + fts3DbExec(&rc2, db, "SELECT 1 FROM %Q.'%q_stat' WHERE id=2", + p->zDb, p->zName); + if( rc2==SQLITE_OK ) p->bHasStat = 1; + } + /* Figure out the page-size for the database. This is required in order to ** estimate the cost of loading large doclists from the database. */ fts3DatabasePageSize(&rc, p); @@ -1169,6 +1363,8 @@ fts3_init_out: sqlite3_free(aIndex); sqlite3_free(zCompress); sqlite3_free(zUncompress); + sqlite3_free(zContent); + sqlite3_free(zLanguageid); sqlite3_free((void *)aCol); if( rc!=SQLITE_OK ){ if( p ){ @@ -1220,6 +1416,7 @@ static int fts3BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ Fts3Table *p = (Fts3Table *)pVTab; int i; /* Iterator variable */ int iCons = -1; /* Index of constraint to use */ + int iLangidCons = -1; /* Index of langid=x constraint, if present */ /* By default use a full table scan. This is an expensive option, ** so search through the constraints to see if a more efficient @@ -1232,7 +1429,8 @@ static int fts3BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ if( pCons->usable==0 ) continue; /* A direct lookup on the rowid or docid column. Assign a cost of 1.0. */ - if( pCons->op==SQLITE_INDEX_CONSTRAINT_EQ + if( iCons<0 + && pCons->op==SQLITE_INDEX_CONSTRAINT_EQ && (pCons->iColumn<0 || pCons->iColumn==p->nColumn+1 ) ){ pInfo->idxNum = FTS3_DOCID_SEARCH; @@ -1255,7 +1453,13 @@ static int fts3BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ pInfo->idxNum = FTS3_FULLTEXT_SEARCH + pCons->iColumn; pInfo->estimatedCost = 2.0; iCons = i; - break; + } + + /* Equality constraint on the langid column */ + if( pCons->op==SQLITE_INDEX_CONSTRAINT_EQ + && pCons->iColumn==p->nColumn + 2 + ){ + iLangidCons = i; } } @@ -1263,6 +1467,9 @@ static int fts3BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ pInfo->aConstraintUsage[iCons].argvIndex = 1; pInfo->aConstraintUsage[iCons].omit = 1; } + if( iLangidCons>=0 ){ + pInfo->aConstraintUsage[iLangidCons].argvIndex = 2; + } /* Regardless of the strategy selected, FTS can deliver rows in rowid (or ** docid) order. Both ascending and descending are possible. @@ -1321,34 +1528,63 @@ static int fts3CloseMethod(sqlite3_vtab_cursor *pCursor){ } /* +** If pCsr->pStmt has not been prepared (i.e. if pCsr->pStmt==0), then +** compose and prepare an SQL statement of the form: +** +** "SELECT <columns> FROM %_content WHERE rowid = ?" +** +** (or the equivalent for a content=xxx table) and set pCsr->pStmt to +** it. If an error occurs, return an SQLite error code. +** +** Otherwise, set *ppStmt to point to pCsr->pStmt and return SQLITE_OK. +*/ +static int fts3CursorSeekStmt(Fts3Cursor *pCsr, sqlite3_stmt **ppStmt){ + int rc = SQLITE_OK; + if( pCsr->pStmt==0 ){ + Fts3Table *p = (Fts3Table *)pCsr->base.pVtab; + char *zSql; + zSql = sqlite3_mprintf("SELECT %s WHERE rowid = ?", p->zReadExprlist); + if( !zSql ) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &pCsr->pStmt, 0); + sqlite3_free(zSql); + } + *ppStmt = pCsr->pStmt; + return rc; +} + +/* ** Position the pCsr->pStmt statement so that it is on the row ** of the %_content table that contains the last match. Return ** SQLITE_OK on success. */ static int fts3CursorSeek(sqlite3_context *pContext, Fts3Cursor *pCsr){ + int rc = SQLITE_OK; if( pCsr->isRequireSeek ){ - sqlite3_bind_int64(pCsr->pStmt, 1, pCsr->iPrevId); - pCsr->isRequireSeek = 0; - if( SQLITE_ROW==sqlite3_step(pCsr->pStmt) ){ - return SQLITE_OK; - }else{ - int rc = sqlite3_reset(pCsr->pStmt); - if( rc==SQLITE_OK ){ - /* If no row was found and no error has occured, then the %_content - ** table is missing a row that is present in the full-text index. - ** The data structures are corrupt. - */ - rc = SQLITE_CORRUPT_VTAB; - } - pCsr->isEof = 1; - if( pContext ){ - sqlite3_result_error_code(pContext, rc); + sqlite3_stmt *pStmt = 0; + + rc = fts3CursorSeekStmt(pCsr, &pStmt); + if( rc==SQLITE_OK ){ + sqlite3_bind_int64(pCsr->pStmt, 1, pCsr->iPrevId); + pCsr->isRequireSeek = 0; + if( SQLITE_ROW==sqlite3_step(pCsr->pStmt) ){ + return SQLITE_OK; + }else{ + rc = sqlite3_reset(pCsr->pStmt); + if( rc==SQLITE_OK && ((Fts3Table *)pCsr->base.pVtab)->zContentTbl==0 ){ + /* If no row was found and no error has occured, then the %_content + ** table is missing a row that is present in the full-text index. + ** The data structures are corrupt. */ + rc = FTS_CORRUPT_VTAB; + pCsr->isEof = 1; + } } - return rc; } - }else{ - return SQLITE_OK; } + + if( rc!=SQLITE_OK && pContext ){ + sqlite3_result_error_code(pContext, rc); + } + return rc; } /* @@ -1398,7 +1634,7 @@ static int fts3ScanInteriorNode( zCsr += sqlite3Fts3GetVarint(zCsr, &iChild); zCsr += sqlite3Fts3GetVarint(zCsr, &iChild); if( zCsr>zEnd ){ - return SQLITE_CORRUPT_VTAB; + return FTS_CORRUPT_VTAB; } while( zCsr<zEnd && (piFirst || piLast) ){ @@ -1416,7 +1652,7 @@ static int fts3ScanInteriorNode( zCsr += sqlite3Fts3GetVarint32(zCsr, &nSuffix); if( nPrefix<0 || nSuffix<0 || &zCsr[nSuffix]>zEnd ){ - rc = SQLITE_CORRUPT_VTAB; + rc = FTS_CORRUPT_VTAB; goto finish_scan; } if( nPrefix+nSuffix>nAlloc ){ @@ -1429,6 +1665,7 @@ static int fts3ScanInteriorNode( } zBuffer = zNew; } + assert( zBuffer ); memcpy(&zBuffer[nPrefix], zCsr, nSuffix); nBuffer = nPrefix + nSuffix; zCsr += nSuffix; @@ -1787,7 +2024,7 @@ static int fts3PoslistPhraseMerge( char **pp1, /* IN/OUT: Left input list */ char **pp2 /* IN/OUT: Right input list */ ){ - char *p = (pp ? *pp : 0); + char *p = *pp; char *p1 = *pp1; char *p2 = *pp2; int iCol1 = 0; @@ -1796,7 +2033,7 @@ static int fts3PoslistPhraseMerge( /* Never set both isSaveLeft and isExact for the same invocation. */ assert( isSaveLeft==0 || isExact==0 ); - assert( *p1!=0 && *p2!=0 ); + assert( p!=0 && *p1!=0 && *p2!=0 ); if( *p1==POS_COLUMN ){ p1++; p1 += sqlite3Fts3GetVarint32(p1, &iCol1); @@ -1813,7 +2050,7 @@ static int fts3PoslistPhraseMerge( sqlite3_int64 iPos1 = 0; sqlite3_int64 iPos2 = 0; - if( pp && iCol1 ){ + if( iCol1 ){ *p++ = POS_COLUMN; p += sqlite3Fts3PutVarint(p, iCol1); } @@ -1828,16 +2065,10 @@ static int fts3PoslistPhraseMerge( || (isExact==0 && iPos2>iPos1 && iPos2<=iPos1+nToken) ){ sqlite3_int64 iSave; - if( !pp ){ - fts3PoslistCopy(0, &p2); - fts3PoslistCopy(0, &p1); - *pp1 = p1; - *pp2 = p2; - return 1; - } iSave = isSaveLeft ? iPos1 : iPos2; fts3PutDeltaVarint(&p, &iPrev, iSave+2); iPrev -= 2; pSave = 0; + assert( p ); } if( (!isSaveLeft && iPos2<=(iPos1+nToken)) || iPos2<=iPos1 ){ if( (*p2&0xFE)==0 ) break; @@ -1886,7 +2117,7 @@ static int fts3PoslistPhraseMerge( fts3PoslistCopy(0, &p1); *pp1 = p1; *pp2 = p2; - if( !pp || *pp==p ){ + if( *pp==p ){ return 0; } *p++ = 0x00; @@ -2122,7 +2353,7 @@ static int fts3DoclistOrMerge( } *paOut = aOut; - *pnOut = (p-aOut); + *pnOut = (int)(p-aOut); assert( *pnOut<=n1+n2+FTS3_VARINT_MAX-1 ); return SQLITE_OK; } @@ -2186,7 +2417,57 @@ static void fts3DoclistPhraseMerge( } } - *pnRight = p - aOut; + *pnRight = (int)(p - aOut); +} + +/* +** Argument pList points to a position list nList bytes in size. This +** function checks to see if the position list contains any entries for +** a token in position 0 (of any column). If so, it writes argument iDelta +** to the output buffer pOut, followed by a position list consisting only +** of the entries from pList at position 0, and terminated by an 0x00 byte. +** The value returned is the number of bytes written to pOut (if any). +*/ +int sqlite3Fts3FirstFilter( + sqlite3_int64 iDelta, /* Varint that may be written to pOut */ + char *pList, /* Position list (no 0x00 term) */ + int nList, /* Size of pList in bytes */ + char *pOut /* Write output here */ +){ + int nOut = 0; + int bWritten = 0; /* True once iDelta has been written */ + char *p = pList; + char *pEnd = &pList[nList]; + + if( *p!=0x01 ){ + if( *p==0x02 ){ + nOut += sqlite3Fts3PutVarint(&pOut[nOut], iDelta); + pOut[nOut++] = 0x02; + bWritten = 1; + } + fts3ColumnlistCopy(0, &p); + } + + while( p<pEnd && *p==0x01 ){ + sqlite3_int64 iCol; + p++; + p += sqlite3Fts3GetVarint(p, &iCol); + if( *p==0x02 ){ + if( bWritten==0 ){ + nOut += sqlite3Fts3PutVarint(&pOut[nOut], iDelta); + bWritten = 1; + } + pOut[nOut++] = 0x01; + nOut += sqlite3Fts3PutVarint(&pOut[nOut], iCol); + pOut[nOut++] = 0x02; + } + fts3ColumnlistCopy(0, &p); + } + if( bWritten ){ + pOut[nOut++] = 0x00; + } + + return nOut; } @@ -2338,6 +2619,7 @@ static int fts3SegReaderCursorAppend( */ static int fts3SegReaderCursor( Fts3Table *p, /* FTS3 table handle */ + int iLangid, /* Language id */ int iIndex, /* Index to search (from 0 to p->nIndex-1) */ int iLevel, /* Level of segments to scan */ const char *zTerm, /* Term to query for */ @@ -2366,7 +2648,7 @@ static int fts3SegReaderCursor( if( iLevel!=FTS3_SEGCURSOR_PENDING ){ if( rc==SQLITE_OK ){ - rc = sqlite3Fts3AllSegdirs(p, iIndex, iLevel, &pStmt); + rc = sqlite3Fts3AllSegdirs(p, iLangid, iIndex, iLevel, &pStmt); } while( rc==SQLITE_OK && SQLITE_ROW==(rc = sqlite3_step(pStmt)) ){ @@ -2389,7 +2671,9 @@ static int fts3SegReaderCursor( } rc = sqlite3Fts3SegReaderNew(pCsr->nSegment+1, - iStartBlock, iLeavesEndBlock, iEndBlock, zRoot, nRoot, &pSeg + (isPrefix==0 && isScan==0), + iStartBlock, iLeavesEndBlock, + iEndBlock, zRoot, nRoot, &pSeg ); if( rc!=SQLITE_OK ) goto finished; rc = fts3SegReaderCursorAppend(pCsr, pSeg); @@ -2409,6 +2693,7 @@ static int fts3SegReaderCursor( */ int sqlite3Fts3SegReaderCursor( Fts3Table *p, /* FTS3 table handle */ + int iLangid, /* Language-id to search */ int iIndex, /* Index to search (from 0 to p->nIndex-1) */ int iLevel, /* Level of segments to scan */ const char *zTerm, /* Term to query for */ @@ -2426,14 +2711,9 @@ int sqlite3Fts3SegReaderCursor( assert( FTS3_SEGCURSOR_ALL<0 && FTS3_SEGCURSOR_PENDING<0 ); assert( isPrefix==0 || isScan==0 ); - /* "isScan" is only set to true by the ft4aux module, an ordinary - ** full-text tables. */ - assert( isScan==0 || p->aIndex==0 ); - memset(pCsr, 0, sizeof(Fts3MultiSegReader)); - return fts3SegReaderCursor( - p, iIndex, iLevel, zTerm, nTerm, isPrefix, isScan, pCsr + p, iLangid, iIndex, iLevel, zTerm, nTerm, isPrefix, isScan, pCsr ); } @@ -2445,11 +2725,14 @@ int sqlite3Fts3SegReaderCursor( */ static int fts3SegReaderCursorAddZero( Fts3Table *p, /* FTS virtual table handle */ + int iLangid, const char *zTerm, /* Term to scan doclist of */ int nTerm, /* Number of bytes in zTerm */ Fts3MultiSegReader *pCsr /* Fts3MultiSegReader to modify */ ){ - return fts3SegReaderCursor(p, 0, FTS3_SEGCURSOR_ALL, zTerm, nTerm, 0, 0,pCsr); + return fts3SegReaderCursor(p, + iLangid, 0, FTS3_SEGCURSOR_ALL, zTerm, nTerm, 0, 0,pCsr + ); } /* @@ -2485,8 +2768,9 @@ static int fts3TermSegReaderCursor( for(i=1; bFound==0 && i<p->nIndex; i++){ if( p->aIndex[i].nPrefix==nTerm ){ bFound = 1; - rc = sqlite3Fts3SegReaderCursor( - p, i, FTS3_SEGCURSOR_ALL, zTerm, nTerm, 0, 0, pSegcsr); + rc = sqlite3Fts3SegReaderCursor(p, pCsr->iLangid, + i, FTS3_SEGCURSOR_ALL, zTerm, nTerm, 0, 0, pSegcsr + ); pSegcsr->bLookup = 1; } } @@ -2494,19 +2778,21 @@ static int fts3TermSegReaderCursor( for(i=1; bFound==0 && i<p->nIndex; i++){ if( p->aIndex[i].nPrefix==nTerm+1 ){ bFound = 1; - rc = sqlite3Fts3SegReaderCursor( - p, i, FTS3_SEGCURSOR_ALL, zTerm, nTerm, 1, 0, pSegcsr + rc = sqlite3Fts3SegReaderCursor(p, pCsr->iLangid, + i, FTS3_SEGCURSOR_ALL, zTerm, nTerm, 1, 0, pSegcsr ); if( rc==SQLITE_OK ){ - rc = fts3SegReaderCursorAddZero(p, zTerm, nTerm, pSegcsr); + rc = fts3SegReaderCursorAddZero( + p, pCsr->iLangid, zTerm, nTerm, pSegcsr + ); } } } } if( bFound==0 ){ - rc = sqlite3Fts3SegReaderCursor( - p, 0, FTS3_SEGCURSOR_ALL, zTerm, nTerm, isPrefix, 0, pSegcsr + rc = sqlite3Fts3SegReaderCursor(p, pCsr->iLangid, + 0, FTS3_SEGCURSOR_ALL, zTerm, nTerm, isPrefix, 0, pSegcsr ); pSegcsr->bLookup = !isPrefix; } @@ -2545,6 +2831,7 @@ static int fts3TermSelect( filter.flags = FTS3_SEGMENT_IGNORE_EMPTY | FTS3_SEGMENT_REQUIRE_POS | (pTok->isPrefix ? FTS3_SEGMENT_PREFIX : 0) + | (pTok->bFirst ? FTS3_SEGMENT_FIRST : 0) | (iColumn<p->nColumn ? FTS3_SEGMENT_COLUMN_FILTER : 0); filter.iCol = iColumn; filter.zTerm = pTok->z; @@ -2660,7 +2947,7 @@ static int fts3FilterMethod( UNUSED_PARAMETER(nVal); assert( idxNum>=0 && idxNum<=(FTS3_FULLTEXT_SEARCH+p->nColumn) ); - assert( nVal==0 || nVal==1 ); + assert( nVal==0 || nVal==1 || nVal==2 ); assert( (nVal==0)==(idxNum==FTS3_FULLSCAN_SEARCH) ); assert( p->pSegments==0 ); @@ -2685,8 +2972,11 @@ static int fts3FilterMethod( return SQLITE_NOMEM; } - rc = sqlite3Fts3ExprParse(p->pTokenizer, p->azColumn, p->nColumn, - iCol, zQuery, -1, &pCsr->pExpr + pCsr->iLangid = 0; + if( nVal==2 ) pCsr->iLangid = sqlite3_value_int(apVal[1]); + + rc = sqlite3Fts3ExprParse(p->pTokenizer, pCsr->iLangid, + p->azColumn, p->bFts4, p->nColumn, iCol, zQuery, -1, &pCsr->pExpr ); if( rc!=SQLITE_OK ){ if( rc==SQLITE_ERROR ){ @@ -2713,23 +3003,24 @@ static int fts3FilterMethod( ** row by docid. */ if( idxNum==FTS3_FULLSCAN_SEARCH ){ - const char *zSort = (pCsr->bDesc ? "DESC" : "ASC"); - const char *zTmpl = "SELECT %s FROM %Q.'%q_content' AS x ORDER BY docid %s"; - zSql = sqlite3_mprintf(zTmpl, p->zReadExprlist, p->zDb, p->zName, zSort); - }else{ - const char *zTmpl = "SELECT %s FROM %Q.'%q_content' AS x WHERE docid = ?"; - zSql = sqlite3_mprintf(zTmpl, p->zReadExprlist, p->zDb, p->zName); + zSql = sqlite3_mprintf( + "SELECT %s ORDER BY rowid %s", + p->zReadExprlist, (pCsr->bDesc ? "DESC" : "ASC") + ); + if( zSql ){ + rc = sqlite3_prepare_v2(p->db, zSql, -1, &pCsr->pStmt, 0); + sqlite3_free(zSql); + }else{ + rc = SQLITE_NOMEM; + } + }else if( idxNum==FTS3_DOCID_SEARCH ){ + rc = fts3CursorSeekStmt(pCsr, &pCsr->pStmt); + if( rc==SQLITE_OK ){ + rc = sqlite3_bind_value(pCsr->pStmt, 1, apVal[0]); + } } - if( !zSql ) return SQLITE_NOMEM; - rc = sqlite3_prepare_v2(p->db, zSql, -1, &pCsr->pStmt, 0); - sqlite3_free(zSql); if( rc!=SQLITE_OK ) return rc; - if( idxNum==FTS3_DOCID_SEARCH ){ - rc = sqlite3_bind_value(pCsr->pStmt, 1, apVal[0]); - if( rc!=SQLITE_OK ) return rc; - } - return fts3NextMethod(pCursor); } @@ -2756,10 +3047,17 @@ static int fts3RowidMethod(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ /* ** This is the xColumn method, called by SQLite to request a value from ** the row that the supplied cursor currently points to. +** +** If: +** +** (iCol < p->nColumn) -> The value of the iCol'th user column. +** (iCol == p->nColumn) -> Magic column with the same name as the table. +** (iCol == p->nColumn+1) -> Docid column +** (iCol == p->nColumn+2) -> Langid column */ static int fts3ColumnMethod( sqlite3_vtab_cursor *pCursor, /* Cursor to retrieve value from */ - sqlite3_context *pContext, /* Context for sqlite3_result_xxx() calls */ + sqlite3_context *pCtx, /* Context for sqlite3_result_xxx() calls */ int iCol /* Index of column to read value from */ ){ int rc = SQLITE_OK; /* Return Code */ @@ -2767,22 +3065,34 @@ static int fts3ColumnMethod( Fts3Table *p = (Fts3Table *)pCursor->pVtab; /* The column value supplied by SQLite must be in range. */ - assert( iCol>=0 && iCol<=p->nColumn+1 ); + assert( iCol>=0 && iCol<=p->nColumn+2 ); if( iCol==p->nColumn+1 ){ /* This call is a request for the "docid" column. Since "docid" is an ** alias for "rowid", use the xRowid() method to obtain the value. */ - sqlite3_result_int64(pContext, pCsr->iPrevId); + sqlite3_result_int64(pCtx, pCsr->iPrevId); }else if( iCol==p->nColumn ){ /* The extra column whose name is the same as the table. - ** Return a blob which is a pointer to the cursor. - */ - sqlite3_result_blob(pContext, &pCsr, sizeof(pCsr), SQLITE_TRANSIENT); + ** Return a blob which is a pointer to the cursor. */ + sqlite3_result_blob(pCtx, &pCsr, sizeof(pCsr), SQLITE_TRANSIENT); + }else if( iCol==p->nColumn+2 && pCsr->pExpr ){ + sqlite3_result_int64(pCtx, pCsr->iLangid); }else{ + /* The requested column is either a user column (one that contains + ** indexed data), or the language-id column. */ rc = fts3CursorSeek(0, pCsr); + if( rc==SQLITE_OK ){ - sqlite3_result_value(pContext, sqlite3_column_value(pCsr->pStmt, iCol+1)); + if( iCol==p->nColumn+2 ){ + int iLangid = 0; + if( p->zLanguageid ){ + iLangid = sqlite3_column_int(pCsr->pStmt, p->nColumn+1); + } + sqlite3_result_int(pCtx, iLangid); + }else if( sqlite3_data_count(pCsr->pStmt)>(iCol+1) ){ + sqlite3_result_value(pCtx, sqlite3_column_value(pCsr->pStmt, iCol+1)); + } } } @@ -2809,8 +3119,42 @@ static int fts3UpdateMethod( ** hash-table to the database. */ static int fts3SyncMethod(sqlite3_vtab *pVtab){ - int rc = sqlite3Fts3PendingTermsFlush((Fts3Table *)pVtab); - sqlite3Fts3SegmentsClose((Fts3Table *)pVtab); + + /* Following an incremental-merge operation, assuming that the input + ** segments are not completely consumed (the usual case), they are updated + ** in place to remove the entries that have already been merged. This + ** involves updating the leaf block that contains the smallest unmerged + ** entry and each block (if any) between the leaf and the root node. So + ** if the height of the input segment b-trees is N, and input segments + ** are merged eight at a time, updating the input segments at the end + ** of an incremental-merge requires writing (8*(1+N)) blocks. N is usually + ** small - often between 0 and 2. So the overhead of the incremental + ** merge is somewhere between 8 and 24 blocks. To avoid this overhead + ** dwarfing the actual productive work accomplished, the incremental merge + ** is only attempted if it will write at least 64 leaf blocks. Hence + ** nMinMerge. + ** + ** Of course, updating the input segments also involves deleting a bunch + ** of blocks from the segments table. But this is not considered overhead + ** as it would also be required by a crisis-merge that used the same input + ** segments. + */ + const u32 nMinMerge = 64; /* Minimum amount of incr-merge work to do */ + + Fts3Table *p = (Fts3Table*)pVtab; + int rc = sqlite3Fts3PendingTermsFlush(p); + + if( rc==SQLITE_OK && p->bAutoincrmerge==1 && p->nLeafAdd>(nMinMerge/16) ){ + int mxLevel = 0; /* Maximum relative level value in db */ + int A; /* Incr-merge parameter A */ + + rc = sqlite3Fts3MaxLevel(p, &mxLevel); + assert( rc==SQLITE_OK || mxLevel==0 ); + A = p->nLeafAdd * mxLevel; + A += (A/2); + if( A>(int)nMinMerge ) rc = sqlite3Fts3Incrmerge(p, A, 8); + } + sqlite3Fts3SegmentsClose(p); return rc; } @@ -2818,13 +3162,14 @@ static int fts3SyncMethod(sqlite3_vtab *pVtab){ ** Implementation of xBegin() method. This is a no-op. */ static int fts3BeginMethod(sqlite3_vtab *pVtab){ - TESTONLY( Fts3Table *p = (Fts3Table*)pVtab ); + Fts3Table *p = (Fts3Table*)pVtab; UNUSED_PARAMETER(pVtab); assert( p->pSegments==0 ); assert( p->nPendingData==0 ); assert( p->inTransaction!=1 ); TESTONLY( p->inTransaction = 1 ); TESTONLY( p->mxSavepoint = -1; ); + p->nLeafAdd = 0; return SQLITE_OK; } @@ -2865,7 +3210,7 @@ static int fts3RollbackMethod(sqlite3_vtab *pVtab){ */ static void fts3ReversePoslist(char *pStart, char **ppPoslist){ char *p = &(*ppPoslist)[-2]; - char c; + char c = 0; while( p>pStart && (c=*p--)==0 ); while( p>pStart && (*p & 0x80) | c ){ @@ -3074,15 +3419,22 @@ static int fts3RenameMethod( sqlite3 *db = p->db; /* Database connection */ int rc; /* Return Code */ + /* As it happens, the pending terms table is always empty here. This is + ** because an "ALTER TABLE RENAME TABLE" statement inside a transaction + ** always opens a savepoint transaction. And the xSavepoint() method + ** flushes the pending terms table. But leave the (no-op) call to + ** PendingTermsFlush() in in case that changes. + */ + assert( p->nPendingData==0 ); rc = sqlite3Fts3PendingTermsFlush(p); - if( rc!=SQLITE_OK ){ - return rc; + + if( p->zContentTbl==0 ){ + fts3DbExec(&rc, db, + "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';", + p->zDb, p->zName, zName + ); } - fts3DbExec(&rc, db, - "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';", - p->zDb, p->zName, zName - ); if( p->bHasDocsize ){ fts3DbExec(&rc, db, "ALTER TABLE %Q.'%q_docsize' RENAME TO '%q_docsize';", @@ -3112,11 +3464,15 @@ static int fts3RenameMethod( ** Flush the contents of the pending-terms table to disk. */ static int fts3SavepointMethod(sqlite3_vtab *pVtab, int iSavepoint){ + int rc = SQLITE_OK; UNUSED_PARAMETER(iSavepoint); assert( ((Fts3Table *)pVtab)->inTransaction ); assert( ((Fts3Table *)pVtab)->mxSavepoint < iSavepoint ); TESTONLY( ((Fts3Table *)pVtab)->mxSavepoint = iSavepoint ); - return fts3SyncMethod(pVtab); + if( ((Fts3Table *)pVtab)->bIgnoreSavepoint==0 ){ + rc = fts3SyncMethod(pVtab); + } + return rc; } /* @@ -3198,6 +3554,9 @@ static void hashDestroy(void *p){ */ void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule); void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule); +#ifdef SQLITE_ENABLE_FTS4_UNICODE61 +void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const**ppModule); +#endif #ifdef SQLITE_ENABLE_ICU void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule); #endif @@ -3213,12 +3572,19 @@ int sqlite3Fts3Init(sqlite3 *db){ Fts3Hash *pHash = 0; const sqlite3_tokenizer_module *pSimple = 0; const sqlite3_tokenizer_module *pPorter = 0; +#ifdef SQLITE_ENABLE_FTS4_UNICODE61 + const sqlite3_tokenizer_module *pUnicode = 0; +#endif #ifdef SQLITE_ENABLE_ICU const sqlite3_tokenizer_module *pIcu = 0; sqlite3Fts3IcuTokenizerModule(&pIcu); #endif +#ifdef SQLITE_ENABLE_FTS4_UNICODE61 + sqlite3Fts3UnicodeTokenizer(&pUnicode); +#endif + #ifdef SQLITE_TEST rc = sqlite3Fts3InitTerm(db); if( rc!=SQLITE_OK ) return rc; @@ -3242,6 +3608,10 @@ int sqlite3Fts3Init(sqlite3 *db){ if( rc==SQLITE_OK ){ if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple) || sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter) + +#ifdef SQLITE_ENABLE_FTS4_UNICODE61 + || sqlite3Fts3HashInsert(pHash, "unicode61", 10, (void *)pUnicode) +#endif #ifdef SQLITE_ENABLE_ICU || (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu)) #endif @@ -3441,21 +3811,20 @@ static int fts3EvalPhraseLoad( */ static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ int iToken; /* Used to iterate through phrase tokens */ - int rc = SQLITE_OK; /* Return code */ char *aPoslist = 0; /* Position list for deferred tokens */ int nPoslist = 0; /* Number of bytes in aPoslist */ int iPrev = -1; /* Token number of previous deferred token */ assert( pPhrase->doclist.bFreeList==0 ); - for(iToken=0; rc==SQLITE_OK && iToken<pPhrase->nToken; iToken++){ + for(iToken=0; iToken<pPhrase->nToken; iToken++){ Fts3PhraseToken *pToken = &pPhrase->aToken[iToken]; Fts3DeferredToken *pDeferred = pToken->pDeferred; if( pDeferred ){ char *pList; int nList; - rc = sqlite3Fts3DeferredTokenList(pDeferred, &pList, &nList); + int rc = sqlite3Fts3DeferredTokenList(pDeferred, &pList, &nList); if( rc!=SQLITE_OK ) return rc; if( pList==0 ){ @@ -3477,7 +3846,7 @@ static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ fts3PoslistPhraseMerge(&aOut, iToken-iPrev, 0, 1, &p1, &p2); sqlite3_free(aPoslist); aPoslist = pList; - nPoslist = aOut - aPoslist; + nPoslist = (int)(aOut - aPoslist); if( nPoslist==0 ){ sqlite3_free(aPoslist); pPhrase->doclist.pList = 0; @@ -3521,7 +3890,7 @@ static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ pPhrase->doclist.pList = aOut; if( fts3PoslistPhraseMerge(&aOut, nDistance, 0, 1, &p1, &p2) ){ pPhrase->doclist.bFreeList = 1; - pPhrase->doclist.nList = (aOut - pPhrase->doclist.pList); + pPhrase->doclist.nList = (int)(aOut - pPhrase->doclist.pList); }else{ sqlite3_free(aOut); pPhrase->doclist.pList = 0; @@ -3556,6 +3925,7 @@ static int fts3EvalPhraseStart(Fts3Cursor *pCsr, int bOptOk, Fts3Phrase *p){ && p->nToken==1 && pFirst->pSegcsr && pFirst->pSegcsr->bLookup + && pFirst->bFirst==0 ){ /* Use the incremental approach. */ int iCol = (p->iColumn >= pTab->nColumn ? -1 : p->iColumn); @@ -3589,7 +3959,7 @@ void sqlite3Fts3DoclistPrev( int nDoclist, /* Length of aDoclist in bytes */ char **ppIter, /* IN/OUT: Iterator pointer */ sqlite3_int64 *piDocid, /* IN/OUT: Docid pointer */ - int *pnList, /* IN/OUT: List length pointer */ + int *pnList, /* OUT: List length pointer */ u8 *pbEof /* OUT: End-of-file flag */ ){ char *p = *ppIter; @@ -3616,7 +3986,7 @@ void sqlite3Fts3DoclistPrev( iMul = (bDescIdx ? -1 : 1); } - *pnList = pEnd - pNext; + *pnList = (int)(pEnd - pNext); *ppIter = pNext; *piDocid = iDocid; }else{ @@ -3630,13 +4000,48 @@ void sqlite3Fts3DoclistPrev( }else{ char *pSave = p; fts3ReversePoslist(aDoclist, &p); - *pnList = (pSave - p); + *pnList = (int)(pSave - p); } *ppIter = p; } } /* +** Iterate forwards through a doclist. +*/ +void sqlite3Fts3DoclistNext( + int bDescIdx, /* True if the doclist is desc */ + char *aDoclist, /* Pointer to entire doclist */ + int nDoclist, /* Length of aDoclist in bytes */ + char **ppIter, /* IN/OUT: Iterator pointer */ + sqlite3_int64 *piDocid, /* IN/OUT: Docid pointer */ + u8 *pbEof /* OUT: End-of-file flag */ +){ + char *p = *ppIter; + + assert( nDoclist>0 ); + assert( *pbEof==0 ); + assert( p || *piDocid==0 ); + assert( !p || (p>=aDoclist && p<=&aDoclist[nDoclist]) ); + + if( p==0 ){ + p = aDoclist; + p += sqlite3Fts3GetVarint(p, piDocid); + }else{ + fts3PoslistCopy(0, &p); + if( p>=&aDoclist[nDoclist] ){ + *pbEof = 1; + }else{ + sqlite3_int64 iVar; + p += sqlite3Fts3GetVarint(p, &iVar); + *piDocid += ((bDescIdx ? -1 : 1) * iVar); + } + } + + *ppIter = p; +} + +/* ** Attempt to move the phrase iterator to point to the next matching docid. ** If an error occurs, return an SQLite error code. Otherwise, return ** SQLITE_OK. @@ -3690,7 +4095,7 @@ static int fts3EvalPhraseNext( } pDL->pList = pIter; fts3PoslistCopy(0, &pIter); - pDL->nList = (pIter - pDL->pList); + pDL->nList = (int)(pIter - pDL->pList); /* pIter now points just past the 0x00 that terminates the position- ** list for document pDL->iDocid. However, if this position-list was @@ -3785,7 +4190,7 @@ static void fts3EvalTokenCosts( Fts3Expr ***ppOr, /* Write new OR root to *(*ppOr)++ */ int *pRc /* IN/OUT: Error code */ ){ - if( *pRc==SQLITE_OK && pExpr ){ + if( *pRc==SQLITE_OK ){ if( pExpr->eType==FTSQUERY_PHRASE ){ Fts3Phrase *pPhrase = pExpr->pPhrase; int i; @@ -3799,6 +4204,11 @@ static void fts3EvalTokenCosts( *pRc = sqlite3Fts3MsrOvfl(pCsr, pTC->pToken->pSegcsr, &pTC->nOvfl); } }else if( pExpr->eType!=FTSQUERY_NOT ){ + assert( pExpr->eType==FTSQUERY_OR + || pExpr->eType==FTSQUERY_AND + || pExpr->eType==FTSQUERY_NEAR + ); + assert( pExpr->pLeft && pExpr->pRight ); if( pExpr->eType==FTSQUERY_OR ){ pRoot = pExpr->pLeft; **ppOr = pRoot; @@ -3859,7 +4269,7 @@ static int fts3EvalAverageDocsize(Fts3Cursor *pCsr, int *pnPage){ } if( nDoc==0 || nByte==0 ){ sqlite3_reset(pStmt); - return SQLITE_CORRUPT_VTAB; + return FTS_CORRUPT_VTAB; } pCsr->nDoc = nDoc; @@ -3903,6 +4313,15 @@ static int fts3EvalSelectDeferred( int nMinEst = 0; /* The minimum count for any phrase so far. */ int nLoad4 = 1; /* (Phrases that will be loaded)^4. */ + /* Tokens are never deferred for FTS tables created using the content=xxx + ** option. The reason being that it is not guaranteed that the content + ** table actually contains the same data as the index. To prevent this from + ** causing any problems, the deferred token optimization is completely + ** disabled for content=xxx tables. */ + if( pTab->zContentTbl ){ + return SQLITE_OK; + } + /* Count the tokens in this AND/NEAR cluster. If none of the doclists ** associated with the tokens spill onto overflow pages, or if there is ** only 1 token, exit early. No tokens to defer in this case. */ @@ -3965,7 +4384,11 @@ static int fts3EvalSelectDeferred( fts3SegReaderCursorFree(pToken->pSegcsr); pToken->pSegcsr = 0; }else{ - nLoad4 = nLoad4*4; + /* Set nLoad4 to the value of (4^nOther) for the next iteration of the + ** for-loop. Except, limit the value to 2^24 to prevent it from + ** overflowing the 32-bit integer it is stored in. */ + if( ii<12 ) nLoad4 = nLoad4*4; + if( ii==0 || pTC->pPhrase->nToken>1 ){ /* Either this is the cheapest token in the entire query, or it is ** part of a multi-token phrase. Either way, the entire doclist will @@ -4013,7 +4436,8 @@ static int fts3EvalStart(Fts3Cursor *pCsr){ fts3EvalAllocateReaders(pCsr, pCsr->pExpr, &nToken, &nOr, &rc); /* Determine which, if any, tokens in the expression should be deferred. */ - if( rc==SQLITE_OK && nToken>1 && pTab->bHasStat ){ +#ifndef SQLITE_DISABLE_FTS4_DEFERRED + if( rc==SQLITE_OK && nToken>1 && pTab->bFts4 ){ Fts3TokenAndCost *aTC; Fts3Expr **apOr; aTC = (Fts3TokenAndCost *)sqlite3_malloc( @@ -4030,8 +4454,8 @@ static int fts3EvalStart(Fts3Cursor *pCsr){ Fts3Expr **ppOr = apOr; fts3EvalTokenCosts(pCsr, 0, pCsr->pExpr, &pTC, &ppOr, &rc); - nToken = pTC-aTC; - nOr = ppOr-apOr; + nToken = (int)(pTC-aTC); + nOr = (int)(ppOr-apOr); if( rc==SQLITE_OK ){ rc = fts3EvalSelectDeferred(pCsr, 0, aTC, nToken); @@ -4043,6 +4467,7 @@ static int fts3EvalStart(Fts3Cursor *pCsr){ sqlite3_free(aTC); } } +#endif fts3EvalStartReaders(pCsr, pCsr->pExpr, 1, &rc); return rc; @@ -4103,7 +4528,7 @@ static int fts3EvalNearTrim( &pOut, aTmp, nParam1, nParam2, paPoslist, &p2 ); if( res ){ - nNew = (pOut - pPhrase->doclist.pList) - 1; + nNew = (int)(pOut - pPhrase->doclist.pList) - 1; assert( pPhrase->doclist.pList[nNew]=='\0' ); assert( nNew<=pPhrase->doclist.nList && nNew>0 ); memset(&pPhrase->doclist.pList[nNew], 0, pPhrase->doclist.nList - nNew); @@ -4318,32 +4743,39 @@ static int fts3EvalNearTest(Fts3Expr *pExpr, int *pRc){ nTmp += p->pRight->pPhrase->doclist.nList; } nTmp += p->pPhrase->doclist.nList; - aTmp = sqlite3_malloc(nTmp*2); - if( !aTmp ){ - *pRc = SQLITE_NOMEM; + if( nTmp==0 ){ res = 0; }else{ - char *aPoslist = p->pPhrase->doclist.pList; - int nToken = p->pPhrase->nToken; + aTmp = sqlite3_malloc(nTmp*2); + if( !aTmp ){ + *pRc = SQLITE_NOMEM; + res = 0; + }else{ + char *aPoslist = p->pPhrase->doclist.pList; + int nToken = p->pPhrase->nToken; - for(p=p->pParent;res && p && p->eType==FTSQUERY_NEAR; p=p->pParent){ - Fts3Phrase *pPhrase = p->pRight->pPhrase; - int nNear = p->nNear; - res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase); - } - - aPoslist = pExpr->pRight->pPhrase->doclist.pList; - nToken = pExpr->pRight->pPhrase->nToken; - for(p=pExpr->pLeft; p && res; p=p->pLeft){ - int nNear = p->pParent->nNear; - Fts3Phrase *pPhrase = ( - p->eType==FTSQUERY_NEAR ? p->pRight->pPhrase : p->pPhrase - ); - res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase); + for(p=p->pParent;res && p && p->eType==FTSQUERY_NEAR; p=p->pParent){ + Fts3Phrase *pPhrase = p->pRight->pPhrase; + int nNear = p->nNear; + res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase); + } + + aPoslist = pExpr->pRight->pPhrase->doclist.pList; + nToken = pExpr->pRight->pPhrase->nToken; + for(p=pExpr->pLeft; p && res; p=p->pLeft){ + int nNear; + Fts3Phrase *pPhrase; + assert( p->pParent && p->pParent->pLeft==p ); + nNear = p->pParent->nNear; + pPhrase = ( + p->eType==FTSQUERY_NEAR ? p->pRight->pPhrase : p->pPhrase + ); + res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase); + } } - } - sqlite3_free(aTmp); + sqlite3_free(aTmp); + } } return res; @@ -4423,6 +4855,7 @@ static int fts3EvalTestExpr( break; default: { +#ifndef SQLITE_DISABLE_FTS4_DEFERRED if( pCsr->pDeferred && (pExpr->iDocid==pCsr->iPrevId || pExpr->bDeferred) ){ @@ -4434,7 +4867,9 @@ static int fts3EvalTestExpr( *pRc = fts3EvalDeferredPhrase(pCsr, pPhrase); bHit = (pPhrase->doclist.pList!=0); pExpr->iDocid = pCsr->iPrevId; - }else{ + }else +#endif + { bHit = (pExpr->bEof==0 && pExpr->iDocid==pCsr->iPrevId); } break; @@ -4770,26 +5205,87 @@ int sqlite3Fts3EvalPhraseStats( ** This function works regardless of whether or not the phrase is deferred, ** incremental, or neither. */ -char *sqlite3Fts3EvalPhrasePoslist( +int sqlite3Fts3EvalPhrasePoslist( Fts3Cursor *pCsr, /* FTS3 cursor object */ Fts3Expr *pExpr, /* Phrase to return doclist for */ - int iCol /* Column to return position list for */ + int iCol, /* Column to return position list for */ + char **ppOut /* OUT: Pointer to position list */ ){ Fts3Phrase *pPhrase = pExpr->pPhrase; Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; - char *pIter = pPhrase->doclist.pList; + char *pIter; int iThis; + sqlite3_int64 iDocid; + /* If this phrase is applies specifically to some column other than + ** column iCol, return a NULL pointer. */ + *ppOut = 0; assert( iCol>=0 && iCol<pTab->nColumn ); - if( !pIter - || pExpr->bEof - || pExpr->iDocid!=pCsr->iPrevId - || (pPhrase->iColumn<pTab->nColumn && pPhrase->iColumn!=iCol) - ){ - return 0; + if( (pPhrase->iColumn<pTab->nColumn && pPhrase->iColumn!=iCol) ){ + return SQLITE_OK; + } + + iDocid = pExpr->iDocid; + pIter = pPhrase->doclist.pList; + if( iDocid!=pCsr->iPrevId || pExpr->bEof ){ + int bDescDoclist = pTab->bDescIdx; /* For DOCID_CMP macro */ + int bOr = 0; + u8 bEof = 0; + Fts3Expr *p; + + /* Check if this phrase descends from an OR expression node. If not, + ** return NULL. Otherwise, the entry that corresponds to docid + ** pCsr->iPrevId may lie earlier in the doclist buffer. */ + for(p=pExpr->pParent; p; p=p->pParent){ + if( p->eType==FTSQUERY_OR ) bOr = 1; + } + if( bOr==0 ) return SQLITE_OK; + + /* This is the descendent of an OR node. In this case we cannot use + ** an incremental phrase. Load the entire doclist for the phrase + ** into memory in this case. */ + if( pPhrase->bIncr ){ + int rc = SQLITE_OK; + int bEofSave = pExpr->bEof; + fts3EvalRestart(pCsr, pExpr, &rc); + while( rc==SQLITE_OK && !pExpr->bEof ){ + fts3EvalNextRow(pCsr, pExpr, &rc); + if( bEofSave==0 && pExpr->iDocid==iDocid ) break; + } + pIter = pPhrase->doclist.pList; + assert( rc!=SQLITE_OK || pPhrase->bIncr==0 ); + if( rc!=SQLITE_OK ) return rc; + } + + if( pExpr->bEof ){ + pIter = 0; + iDocid = 0; + } + bEof = (pPhrase->doclist.nAll==0); + assert( bDescDoclist==0 || bDescDoclist==1 ); + assert( pCsr->bDesc==0 || pCsr->bDesc==1 ); + + if( pCsr->bDesc==bDescDoclist ){ + int dummy; + while( (pIter==0 || DOCID_CMP(iDocid, pCsr->iPrevId)>0 ) && bEof==0 ){ + sqlite3Fts3DoclistPrev( + bDescDoclist, pPhrase->doclist.aAll, pPhrase->doclist.nAll, + &pIter, &iDocid, &dummy, &bEof + ); + } + }else{ + while( (pIter==0 || DOCID_CMP(iDocid, pCsr->iPrevId)<0 ) && bEof==0 ){ + sqlite3Fts3DoclistNext( + bDescDoclist, pPhrase->doclist.aAll, pPhrase->doclist.nAll, + &pIter, &iDocid, &bEof + ); + } + } + + if( bEof || iDocid!=pCsr->iPrevId ) pIter = 0; } + if( pIter==0 ) return SQLITE_OK; - assert( pPhrase->doclist.nList>0 ); if( *pIter==0x01 ){ pIter++; pIter += sqlite3Fts3GetVarint32(pIter, &iThis); @@ -4803,7 +5299,8 @@ char *sqlite3Fts3EvalPhrasePoslist( pIter += sqlite3Fts3GetVarint32(pIter, &iThis); } - return ((iCol==iThis)?pIter:0); + *ppOut = ((iCol==iThis)?pIter:0); + return SQLITE_OK; } /* @@ -4826,6 +5323,16 @@ void sqlite3Fts3EvalPhraseCleanup(Fts3Phrase *pPhrase){ } } + +/* +** Return SQLITE_CORRUPT_VTAB. +*/ +#ifdef SQLITE_DEBUG +int sqlite3Fts3Corrupt(){ + return SQLITE_CORRUPT_VTAB; +} +#endif + #if !SQLITE_CORE /* ** Initialize API pointer table, if required. diff --git a/src/libtracker-fts/fts3.h b/src/libtracker-fts/fts3.h index 4a06f637f..b0346826d 100644 --- a/src/libtracker-fts/fts3.h +++ b/src/libtracker-fts/fts3.h @@ -20,9 +20,12 @@ extern "C" { #endif /* __cplusplus */ int sqlite3Fts3Init(sqlite3 *db); -int fts4_extension_init(sqlite3 *db, - char **pzErrMsg, - void *pApi); + +int fts4_extension_init( + sqlite3 *db, + char **pzErrMsg, + void *pApi +); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/libtracker-fts/fts3Int.h b/src/libtracker-fts/fts3Int.h index ed8043adf..77ca4704e 100644 --- a/src/libtracker-fts/fts3Int.h +++ b/src/libtracker-fts/fts3Int.h @@ -67,6 +67,9 @@ extern const sqlite3_api_routines *sqlite3_api; #ifndef MIN # define MIN(x,y) ((x)<(y)?(x):(y)) #endif +#ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +#endif /* ** Maximum length of a varint encoded integer. The varint format is different @@ -121,7 +124,7 @@ extern const sqlite3_api_routines *sqlite3_api; # define NEVER(X) (0) #else # define ALWAYS(x) (x) -# define NEVER(X) (x) +# define NEVER(x) (x) #endif /* @@ -131,6 +134,7 @@ typedef unsigned char u8; /* 1-byte (or larger) unsigned integer */ typedef short int i16; /* 2-byte (or larger) signed integer */ typedef unsigned int u32; /* 4-byte unsigned integer */ typedef sqlite3_uint64 u64; /* 8-byte unsigned integer */ +typedef sqlite3_int64 i64; /* 8-byte signed integer */ /* ** Macro used to suppress compiler warnings for unused parameters. @@ -157,6 +161,13 @@ typedef sqlite3_uint64 u64; /* 8-byte unsigned integer */ #endif /* SQLITE_AMALGAMATION */ +#ifdef SQLITE_DEBUG +int sqlite3Fts3Corrupt(void); +# define FTS_CORRUPT_VTAB sqlite3Fts3Corrupt() +#else +# define FTS_CORRUPT_VTAB SQLITE_CORRUPT_VTAB +#endif + typedef struct Fts3Table Fts3Table; typedef struct Fts3Cursor Fts3Cursor; typedef struct Fts3Expr Fts3Expr; @@ -184,36 +195,45 @@ struct Fts3Table { int nColumn; /* number of named columns in virtual table */ char **azColumn; /* column names. malloced */ sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ + char *zContentTbl; /* content=xxx option, or NULL */ + char *zLanguageid; /* languageid=xxx option, or NULL */ + u8 bAutoincrmerge; /* True if automerge=1 */ + u32 nLeafAdd; /* Number of leaf blocks added this trans */ /* Precompiled statements used by the implementation. Each of these ** statements is run and reset within a single virtual table API call. */ - sqlite3_stmt *aStmt[27]; + sqlite3_stmt *aStmt[37]; char *zReadExprlist; char *zWriteExprlist; int nNodeSize; /* Soft limit for node size */ + u8 bFts4; /* True for FTS4, false for FTS3 */ u8 bHasStat; /* True if %_stat table exists */ u8 bHasDocsize; /* True if %_docsize table exists */ u8 bDescIdx; /* True if doclists are in reverse order */ + u8 bIgnoreSavepoint; /* True to ignore xSavepoint invocations */ int nPgsz; /* Page size for host database */ char *zSegmentsTbl; /* Name of %_segments table */ sqlite3_blob *pSegments; /* Blob handle open on %_segments table */ - /* TODO: Fix the first paragraph of this comment. - ** - ** The following hash table is used to buffer pending index updates during - ** transactions. Variable nPendingData estimates the memory size of the - ** pending data, including hash table overhead, but not malloc overhead. - ** When nPendingData exceeds nMaxPendingData, the buffer is flushed - ** automatically. Variable iPrevDocid is the docid of the most recently - ** inserted record. + /* + ** The following array of hash tables is used to buffer pending index + ** updates during transactions. All pending updates buffered at any one + ** time must share a common language-id (see the FTS4 langid= feature). + ** The current language id is stored in variable iPrevLangid. ** ** A single FTS4 table may have multiple full-text indexes. For each index ** there is an entry in the aIndex[] array. Index 0 is an index of all the ** terms that appear in the document set. Each subsequent index in aIndex[] ** is an index of prefixes of a specific length. + ** + ** Variable nPendingData contains an estimate the memory consumed by the + ** pending data structures, including hash table overhead, but not including + ** malloc overhead. When nPendingData exceeds nMaxPendingData, all hash + ** tables are flushed to disk. Variable iPrevDocid is the docid of the most + ** recently inserted record. */ int nIndex; /* Size of aIndex[] */ struct Fts3Index { @@ -223,12 +243,13 @@ struct Fts3Table { int nMaxPendingData; /* Max pending data before flush to disk */ int nPendingData; /* Current bytes of pending data */ sqlite_int64 iPrevDocid; /* Docid of most recently inserted document */ + int iPrevLangid; /* Langid of recently inserted document */ -#if defined(SQLITE_DEBUG) +#if defined(SQLITE_DEBUG) || defined(SQLITE_COVERAGE_TEST) /* State variables used for validating that the transaction control ** methods of the virtual table are called at appropriate times. These - ** values do not contribution to the FTS computation; they are used for - ** verifying the SQLite core. + ** values do not contribute to FTS functionality; they are used for + ** verifying the operation of the SQLite core. */ int inTransaction; /* True after xBegin but before xCommit/xRollback */ int mxSavepoint; /* Largest valid xSavepoint integer */ @@ -247,6 +268,7 @@ struct Fts3Cursor { u8 isRequireSeek; /* True if must seek pStmt to %_content row */ sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */ Fts3Expr *pExpr; /* Parsed MATCH query string */ + int iLangid; /* Language being queried for */ int nPhrase; /* Number of matchable phrases in query */ Fts3DeferredToken *pDeferred; /* Deferred search tokens, if any */ sqlite3_int64 iPrevId; /* Previous id read from aDoclist */ @@ -309,6 +331,7 @@ struct Fts3PhraseToken { char *z; /* Text of the token */ int n; /* Number of bytes in buffer z */ int isPrefix; /* True if token ends with a "*" character */ + int bFirst; /* True if token must appear at position 0 */ /* Variables above this point are populated when the expression is ** parsed (by code in fts3_expr.c). Below this point the variables are @@ -392,23 +415,34 @@ int sqlite3Fts3UpdateMethod(sqlite3_vtab*,int,sqlite3_value**,sqlite3_int64*); int sqlite3Fts3PendingTermsFlush(Fts3Table *); void sqlite3Fts3PendingTermsClear(Fts3Table *); int sqlite3Fts3Optimize(Fts3Table *); -int sqlite3Fts3SegReaderNew(int, sqlite3_int64, +int sqlite3Fts3SegReaderNew(int, int, sqlite3_int64, sqlite3_int64, sqlite3_int64, const char *, int, Fts3SegReader**); int sqlite3Fts3SegReaderPending( Fts3Table*,int,const char*,int,int,Fts3SegReader**); void sqlite3Fts3SegReaderFree(Fts3SegReader *); -int sqlite3Fts3AllSegdirs(Fts3Table*, int, int, sqlite3_stmt **); +int sqlite3Fts3AllSegdirs(Fts3Table*, int, int, int, sqlite3_stmt **); int sqlite3Fts3ReadLock(Fts3Table *); int sqlite3Fts3ReadBlock(Fts3Table*, sqlite3_int64, char **, int*, int*); int sqlite3Fts3SelectDoctotal(Fts3Table *, sqlite3_stmt **); int sqlite3Fts3SelectDocsize(Fts3Table *, sqlite3_int64, sqlite3_stmt **); +#ifndef SQLITE_DISABLE_FTS4_DEFERRED void sqlite3Fts3FreeDeferredTokens(Fts3Cursor *); int sqlite3Fts3DeferToken(Fts3Cursor *, Fts3PhraseToken *, int); int sqlite3Fts3CacheDeferredDoclists(Fts3Cursor *); void sqlite3Fts3FreeDeferredDoclists(Fts3Cursor *); +int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *); +#else +# define sqlite3Fts3FreeDeferredTokens(x) +# define sqlite3Fts3DeferToken(x,y,z) SQLITE_OK +# define sqlite3Fts3CacheDeferredDoclists(x) SQLITE_OK +# define sqlite3Fts3FreeDeferredDoclists(x) +# define sqlite3Fts3DeferredTokenList(x,y,z) SQLITE_OK +#endif + void sqlite3Fts3SegmentsClose(Fts3Table *); +int sqlite3Fts3MaxLevel(Fts3Table *, int *); /* Special values interpreted by sqlite3SegReaderCursor() */ #define FTS3_SEGCURSOR_PENDING -1 @@ -418,8 +452,8 @@ int sqlite3Fts3SegReaderStart(Fts3Table*, Fts3MultiSegReader*, Fts3SegFilter*); int sqlite3Fts3SegReaderStep(Fts3Table *, Fts3MultiSegReader *); void sqlite3Fts3SegReaderFinish(Fts3MultiSegReader *); -int sqlite3Fts3SegReaderCursor( - Fts3Table *, int, int, const char *, int, int, int, Fts3MultiSegReader *); +int sqlite3Fts3SegReaderCursor(Fts3Table *, + int, int, int, const char *, int, int, int, Fts3MultiSegReader *); /* Flags allowed as part of the 4th argument to SegmentReaderIterate() */ #define FTS3_SEGMENT_REQUIRE_POS 0x00000001 @@ -427,6 +461,7 @@ int sqlite3Fts3SegReaderCursor( #define FTS3_SEGMENT_COLUMN_FILTER 0x00000004 #define FTS3_SEGMENT_PREFIX 0x00000008 #define FTS3_SEGMENT_SCAN 0x00000010 +#define FTS3_SEGMENT_FIRST 0x00000020 /* Type passed as 4th argument to SegmentReaderIterate() */ struct Fts3SegFilter { @@ -459,6 +494,8 @@ struct Fts3MultiSegReader { int nDoclist; /* Size of aDoclist[] in bytes */ }; +int sqlite3Fts3Incrmerge(Fts3Table*,int,int); + /* fts3.c */ int sqlite3Fts3PutVarint(char *, sqlite3_int64); int sqlite3Fts3GetVarint(const char *, sqlite_int64 *); @@ -466,8 +503,9 @@ int sqlite3Fts3GetVarint32(const char *, int *); int sqlite3Fts3VarintLen(sqlite3_uint64); void sqlite3Fts3Dequote(char *); void sqlite3Fts3DoclistPrev(int,char*,int,char**,sqlite3_int64*,int*,u8*); - int sqlite3Fts3EvalPhraseStats(Fts3Cursor *, Fts3Expr *, u32 *); +int sqlite3Fts3FirstFilter(sqlite3_int64, char *, int, char *); +void sqlite3Fts3CreateStatTable(int*, Fts3Table*); /* fts3_tokenizer.c */ const char *sqlite3Fts3NextToken(const char *, int *); @@ -485,8 +523,8 @@ void sqlite3Fts3Snippet(sqlite3_context *, Fts3Cursor *, const char *, void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *, const char *); /* fts3_expr.c */ -int sqlite3Fts3ExprParse(sqlite3_tokenizer *, - char **, int, int, const char *, int, Fts3Expr ** +int sqlite3Fts3ExprParse(sqlite3_tokenizer *, int, + char **, int, int, int, const char *, int, Fts3Expr ** ); void sqlite3Fts3ExprFree(Fts3Expr *); #ifdef SQLITE_TEST @@ -494,6 +532,10 @@ int sqlite3Fts3ExprInitTestInterface(sqlite3 *db); int sqlite3Fts3InitTerm(sqlite3 *db); #endif +int sqlite3Fts3OpenTokenizer(sqlite3_tokenizer *, int, const char *, int, + sqlite3_tokenizer_cursor ** +); + /* fts3_aux.c */ int sqlite3Fts3InitAux(sqlite3 *db); @@ -503,11 +545,16 @@ int sqlite3Fts3MsrIncrStart( Fts3Table*, Fts3MultiSegReader*, int, const char*, int); int sqlite3Fts3MsrIncrNext( Fts3Table *, Fts3MultiSegReader *, sqlite3_int64 *, char **, int *); -char *sqlite3Fts3EvalPhrasePoslist(Fts3Cursor *, Fts3Expr *, int iCol); +int sqlite3Fts3EvalPhrasePoslist(Fts3Cursor *, Fts3Expr *, int iCol, char **); int sqlite3Fts3MsrOvfl(Fts3Cursor *, Fts3MultiSegReader *, int *); int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr); -int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *); +/* fts3_unicode2.c (functions generated by parsing unicode text files) */ +#ifdef SQLITE_ENABLE_FTS4_UNICODE61 +int sqlite3FtsUnicodeFold(int, int); +int sqlite3FtsUnicodeIsalnum(int); +int sqlite3FtsUnicodeIsdiacritic(int); +#endif #endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */ #endif /* _FTSINT_H */ diff --git a/src/libtracker-fts/fts3_aux.c b/src/libtracker-fts/fts3_aux.c index ada85d796..a2bff2e1d 100644 --- a/src/libtracker-fts/fts3_aux.c +++ b/src/libtracker-fts/fts3_aux.c @@ -79,9 +79,9 @@ static int fts3auxConnectMethod( } zDb = argv[1]; - nDb = strlen(zDb); + nDb = (int)strlen(zDb); zFts3 = argv[3]; - nFts3 = strlen(zFts3); + nFts3 = (int)strlen(zFts3); rc = sqlite3_declare_vtab(db, FTS3_TERMS_SCHEMA); if( rc!=SQLITE_OK ) return rc; @@ -376,7 +376,7 @@ static int fts3auxFilterMethod( if( pCsr->zStop==0 ) return SQLITE_NOMEM; } - rc = sqlite3Fts3SegReaderCursor(pFts3, 0, FTS3_SEGCURSOR_ALL, + rc = sqlite3Fts3SegReaderCursor(pFts3, 0, 0, FTS3_SEGCURSOR_ALL, pCsr->filter.zTerm, pCsr->filter.nTerm, 0, isScan, &pCsr->csr ); if( rc==SQLITE_OK ){ diff --git a/src/libtracker-fts/fts3_expr.c b/src/libtracker-fts/fts3_expr.c index 7eb2962d4..7612789de 100644 --- a/src/libtracker-fts/fts3_expr.c +++ b/src/libtracker-fts/fts3_expr.c @@ -92,7 +92,9 @@ int sqlite3_fts3_enable_parentheses = 0; typedef struct ParseContext ParseContext; struct ParseContext { sqlite3_tokenizer *pTokenizer; /* Tokenizer module */ + int iLangid; /* Language id used with tokenizer */ const char **azCol; /* Array of column names for fts3 table */ + int bFts4; /* True to allow FTS4-only syntax */ int nCol; /* Number of entries in azCol[] */ int iDefaultCol; /* Default column to query */ int isNot; /* True if getNextNode() sees a unary - */ @@ -126,6 +128,33 @@ static void *fts3MallocZero(int nByte){ return pRet; } +int sqlite3Fts3OpenTokenizer( + sqlite3_tokenizer *pTokenizer, + int iLangid, + const char *z, + int n, + sqlite3_tokenizer_cursor **ppCsr +){ + sqlite3_tokenizer_module const *pModule = pTokenizer->pModule; + sqlite3_tokenizer_cursor *pCsr = 0; + int rc; + + rc = pModule->xOpen(pTokenizer, z, n, &pCsr); + assert( rc==SQLITE_OK || pCsr==0 ); + if( rc==SQLITE_OK ){ + pCsr->pTokenizer = pTokenizer; + if( pModule->iVersion>=1 ){ + rc = pModule->xLanguageid(pCsr, iLangid); + if( rc!=SQLITE_OK ){ + pModule->xClose(pCsr); + pCsr = 0; + } + } + } + *ppCsr = pCsr; + return rc; +} + /* ** Extract the next token from buffer z (length n) using the tokenizer @@ -153,15 +182,13 @@ static int getNextToken( Fts3Expr *pRet = 0; int nConsumed = 0; - rc = pModule->xOpen(pTokenizer, z, n, &pCursor); + rc = sqlite3Fts3OpenTokenizer(pTokenizer, pParse->iLangid, z, n, &pCursor); if( rc==SQLITE_OK ){ const char *zToken; - int nToken, iStart, iEnd, iPosition; + int nToken = 0, iStart = 0, iEnd = 0, iPosition = 0; int nByte; /* total space to allocate */ - pCursor->pTokenizer = pTokenizer; rc = pModule->xNext(pCursor, &zToken, &nToken, &iStart, &iEnd, &iPosition); - if( rc==SQLITE_OK ){ nByte = sizeof(Fts3Expr) + sizeof(Fts3Phrase) + nToken; pRet = (Fts3Expr *)fts3MallocZero(nByte); @@ -180,9 +207,21 @@ static int getNextToken( pRet->pPhrase->aToken[0].isPrefix = 1; iEnd++; } - if( !sqlite3_fts3_enable_parentheses && iStart>0 && z[iStart-1]=='-' ){ - pParse->isNot = 1; + + while( 1 ){ + if( !sqlite3_fts3_enable_parentheses + && iStart>0 && z[iStart-1]=='-' + ){ + pParse->isNot = 1; + iStart--; + }else if( pParse->bFts4 && iStart>0 && z[iStart-1]=='^' ){ + pRet->pPhrase->aToken[0].bFirst = 1; + iStart--; + }else{ + break; + } } + } nConsumed = iEnd; } @@ -255,13 +294,13 @@ static int getNextString( ** appends buffer zTemp to buffer p, and fills in the Fts3Expr and Fts3Phrase ** structures. */ - rc = pModule->xOpen(pTokenizer, zInput, nInput, &pCursor); + rc = sqlite3Fts3OpenTokenizer( + pTokenizer, pParse->iLangid, zInput, nInput, &pCursor); if( rc==SQLITE_OK ){ int ii; - pCursor->pTokenizer = pTokenizer; for(ii=0; rc==SQLITE_OK; ii++){ const char *zByte; - int nByte, iBegin, iEnd, iPos; + int nByte = 0, iBegin = 0, iEnd = 0, iPos = 0; rc = pModule->xNext(pCursor, &zByte, &nByte, &iBegin, &iEnd, &iPos); if( rc==SQLITE_OK ){ Fts3PhraseToken *pToken; @@ -281,6 +320,7 @@ static int getNextString( pToken->n = nByte; pToken->isPrefix = (iEnd<nInput && zInput[iEnd]=='*'); + pToken->bFirst = (iBegin>0 && zInput[iBegin-1]=='^'); nToken = ii+1; } } @@ -302,8 +342,12 @@ static int getNextString( p->pPhrase->nToken = nToken; zBuf = (char *)&p->pPhrase->aToken[nToken]; - memcpy(zBuf, zTemp, nTemp); - sqlite3_free(zTemp); + if( zTemp ){ + memcpy(zBuf, zTemp, nTemp); + sqlite3_free(zTemp); + }else{ + assert( nTemp==0 ); + } for(jj=0; jj<p->pPhrase->nToken; jj++){ p->pPhrase->aToken[jj].z = zBuf; @@ -727,7 +771,9 @@ exprparse_out: */ int sqlite3Fts3ExprParse( sqlite3_tokenizer *pTokenizer, /* Tokenizer module */ + int iLangid, /* Language id for tokenizer */ char **azCol, /* Array of column names for fts3 table */ + int bFts4, /* True to allow FTS4-only syntax */ int nCol, /* Number of entries in azCol[] */ int iDefaultCol, /* Default column to query */ const char *z, int n, /* Text of MATCH query */ @@ -736,11 +782,14 @@ int sqlite3Fts3ExprParse( int nParsed; int rc; ParseContext sParse; + + memset(&sParse, 0, sizeof(ParseContext)); sParse.pTokenizer = pTokenizer; + sParse.iLangid = iLangid; sParse.azCol = (const char **)azCol; sParse.nCol = nCol; sParse.iDefaultCol = iDefaultCol; - sParse.nNest = 0; + sParse.bFts4 = bFts4; if( z==0 ){ *ppExpr = 0; return SQLITE_OK; @@ -930,7 +979,7 @@ static void fts3ExprTest( } rc = sqlite3Fts3ExprParse( - pTokenizer, azCol, nCol, nCol, zExpr, nExpr, &pExpr + pTokenizer, 0, azCol, 0, nCol, nCol, zExpr, nExpr, &pExpr ); if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM ){ sqlite3_result_error(context, "Error parsing expression", -1); diff --git a/src/libtracker-fts/fts3_icu.c b/src/libtracker-fts/fts3_icu.c index a10a55d67..52df8c7d8 100644 --- a/src/libtracker-fts/fts3_icu.c +++ b/src/libtracker-fts/fts3_icu.c @@ -110,13 +110,16 @@ static int icuOpen( *ppCursor = 0; - if( nInput<0 ){ + if( zInput==0 ){ + nInput = 0; + zInput = ""; + }else if( nInput<0 ){ nInput = strlen(zInput); } nChar = nInput+1; pCsr = (IcuCursor *)sqlite3_malloc( sizeof(IcuCursor) + /* IcuCursor */ - nChar * sizeof(UChar) + /* IcuCursor.aChar[] */ + ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */ (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ ); if( !pCsr ){ @@ -124,7 +127,7 @@ static int icuOpen( } memset(pCsr, 0, sizeof(IcuCursor)); pCsr->aChar = (UChar *)&pCsr[1]; - pCsr->aOffset = (int *)&pCsr->aChar[nChar]; + pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3]; pCsr->aOffset[iOut] = iInput; U8_NEXT(zInput, iInput, nInput, c); @@ -196,7 +199,7 @@ static int icuNext( while( iStart<iEnd ){ int iWhite = iStart; - U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); + U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); if( u_isspace(c) ){ iStart = iWhite; }else{ diff --git a/src/libtracker-fts/fts3_porter.c b/src/libtracker-fts/fts3_porter.c index 148c57008..579745b85 100644 --- a/src/libtracker-fts/fts3_porter.c +++ b/src/libtracker-fts/fts3_porter.c @@ -40,7 +40,7 @@ typedef struct porter_tokenizer { } porter_tokenizer; /* -** Class derived from sqlit3_tokenizer_cursor +** Class derived from sqlite3_tokenizer_cursor */ typedef struct porter_tokenizer_cursor { sqlite3_tokenizer_cursor base; @@ -630,6 +630,7 @@ static const sqlite3_tokenizer_module porterTokenizerModule = { porterOpen, porterClose, porterNext, + 0 }; /* diff --git a/src/libtracker-fts/fts3_snippet.c b/src/libtracker-fts/fts3_snippet.c index b569eb131..4bee014dc 100644 --- a/src/libtracker-fts/fts3_snippet.c +++ b/src/libtracker-fts/fts3_snippet.c @@ -360,23 +360,27 @@ static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){ SnippetIter *p = (SnippetIter *)ctx; SnippetPhrase *pPhrase = &p->aPhrase[iPhrase]; char *pCsr; + int rc; pPhrase->nToken = pExpr->pPhrase->nToken; - - pCsr = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol); + rc = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol, &pCsr); + assert( rc==SQLITE_OK || pCsr==0 ); if( pCsr ){ int iFirst = 0; pPhrase->pList = pCsr; fts3GetDeltaPosition(&pCsr, &iFirst); + assert( iFirst>=0 ); pPhrase->pHead = pCsr; pPhrase->pTail = pCsr; pPhrase->iHead = iFirst; pPhrase->iTail = iFirst; }else{ - assert( pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0 ); + assert( rc!=SQLITE_OK || ( + pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0 + )); } - return SQLITE_OK; + return rc; } /* @@ -531,6 +535,7 @@ static int fts3StringAppend( */ static int fts3SnippetShift( Fts3Table *pTab, /* FTS3 table snippet comes from */ + int iLangid, /* Language id to use in tokenizing */ int nSnippet, /* Number of tokens desired for snippet */ const char *zDoc, /* Document text to extract snippet from */ int nDoc, /* Size of buffer zDoc in bytes */ @@ -566,13 +571,12 @@ static int fts3SnippetShift( /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired) ** or more tokens in zDoc/nDoc. */ - rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC); + rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, iLangid, zDoc, nDoc, &pC); if( rc!=SQLITE_OK ){ return rc; } - pC->pTokenizer = pTab->pTokenizer; while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){ - const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3; + const char *ZDUMMY; int DUMMY1 = 0, DUMMY2 = 0, DUMMY3 = 0; rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent); } pMod->xClose(pC); @@ -616,8 +620,6 @@ static int fts3SnippetText( int iCol = pFragment->iCol+1; /* Query column to extract text from */ sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */ sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */ - const char *ZDUMMY; /* Dummy argument used with tokenizer */ - int DUMMY1; /* Dummy argument used with tokenizer */ zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol); if( zDoc==0 ){ @@ -630,17 +632,29 @@ static int fts3SnippetText( /* Open a token cursor on the document. */ pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; - rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC); + rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc,nDoc,&pC); if( rc!=SQLITE_OK ){ return rc; } - pC->pTokenizer = pTab->pTokenizer; while( rc==SQLITE_OK ){ - int iBegin; /* Offset in zDoc of start of token */ - int iFin; /* Offset in zDoc of end of token */ - int isHighlight; /* True for highlighted terms */ - + const char *ZDUMMY; /* Dummy argument used with tokenizer */ + int DUMMY1 = -1; /* Dummy argument used with tokenizer */ + int iBegin = 0; /* Offset in zDoc of start of token */ + int iFin = 0; /* Offset in zDoc of end of token */ + int isHighlight = 0; /* True for highlighted terms */ + + /* Variable DUMMY1 is initialized to a negative value above. Elsewhere + ** in the FTS code the variable that the third argument to xNext points to + ** is initialized to zero before the first (*but not necessarily + ** subsequent*) call to xNext(). This is done for a particular application + ** that needs to know whether or not the tokenizer is being used for + ** snippet generation or for some other purpose. + ** + ** Extreme care is required when writing code to depend on this + ** initialization. It is not a documented part of the tokenizer interface. + ** If a tokenizer is used directly by any code outside of FTS, this + ** convention might not be respected. */ rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent); if( rc!=SQLITE_OK ){ if( rc==SQLITE_DONE ){ @@ -656,7 +670,9 @@ static int fts3SnippetText( if( !isShiftDone ){ int n = nDoc - iBegin; - rc = fts3SnippetShift(pTab, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask); + rc = fts3SnippetShift( + pTab, pCsr->iLangid, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask + ); isShiftDone = 1; /* Now that the shift has been done, check if the initial "..." are @@ -768,13 +784,14 @@ static int fts3ExprLocalHitsCb( int iPhrase, /* Phrase number */ void *pCtx /* Pointer to MatchInfo structure */ ){ + int rc = SQLITE_OK; MatchInfo *p = (MatchInfo *)pCtx; int iStart = iPhrase * p->nCol * 3; int i; - for(i=0; i<p->nCol; i++){ + for(i=0; i<p->nCol && rc==SQLITE_OK; i++){ char *pCsr; - pCsr = sqlite3Fts3EvalPhrasePoslist(p->pCursor, pExpr, i); + rc = sqlite3Fts3EvalPhrasePoslist(p->pCursor, pExpr, i, &pCsr); if( pCsr ){ p->aMatchinfo[iStart+i*3] = fts3ColumnlistCount(&pCsr); }else{ @@ -782,7 +799,7 @@ static int fts3ExprLocalHitsCb( } } - return SQLITE_OK; + return rc; } static int fts3MatchinfoCheck( @@ -792,8 +809,8 @@ static int fts3MatchinfoCheck( ){ if( (cArg==FTS3_MATCHINFO_NPHRASE) || (cArg==FTS3_MATCHINFO_NCOL) - || (cArg==FTS3_MATCHINFO_NDOC && pTab->bHasStat) - || (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bHasStat) + || (cArg==FTS3_MATCHINFO_NDOC && pTab->bFts4) + || (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bFts4) || (cArg==FTS3_MATCHINFO_LENGTH && pTab->bHasDocsize) || (cArg==FTS3_MATCHINFO_LCS) || (cArg==FTS3_MATCHINFO_HITS) @@ -848,7 +865,7 @@ static int fts3MatchinfoSelectDoctotal( a = sqlite3_column_blob(pStmt, 0); a += sqlite3Fts3GetVarint(a, &nDoc); - if( nDoc==0 ) return SQLITE_CORRUPT_VTAB; + if( nDoc==0 ) return FTS_CORRUPT_VTAB; *pnDoc = (u32)nDoc; if( paLen ) *paLen = a; @@ -943,8 +960,10 @@ static int fts3MatchinfoLcs(Fts3Cursor *pCsr, MatchInfo *pInfo){ int nLive = 0; /* Number of iterators in aIter not at EOF */ for(i=0; i<pInfo->nPhrase; i++){ + int rc; LcsIterator *pIt = &aIter[i]; - pIt->pRead = sqlite3Fts3EvalPhrasePoslist(pCsr, pIt->pExpr, iCol); + rc = sqlite3Fts3EvalPhrasePoslist(pCsr, pIt->pExpr, iCol, &pIt->pRead); + if( rc!=SQLITE_OK ) return rc; if( pIt->pRead ){ pIt->iPos = pIt->iPosOffset; fts3LcsIteratorAdvance(&aIter[i]); @@ -1296,9 +1315,10 @@ static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){ int iTerm; /* For looping through nTerm phrase terms */ char *pList; /* Pointer to position list for phrase */ int iPos = 0; /* First position in position-list */ + int rc; UNUSED_PARAMETER(iPhrase); - pList = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol); + rc = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol, &pList); nTerm = pExpr->pPhrase->nToken; if( pList ){ fts3GetDeltaPosition(&pList, &iPos); @@ -1312,7 +1332,7 @@ static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){ pT->iPos = iPos; } - return SQLITE_OK; + return rc; } /* @@ -1324,8 +1344,6 @@ void sqlite3Fts3Offsets( ){ Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule; - const char *ZDUMMY; /* Dummy argument used with xNext() */ - int NDUMMY; /* Dummy argument used with xNext() */ int rc; /* Return Code */ int nToken; /* Number of tokens in query */ int iCol; /* Column currently being processed */ @@ -1358,9 +1376,11 @@ void sqlite3Fts3Offsets( */ for(iCol=0; iCol<pTab->nColumn; iCol++){ sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */ - int iStart; - int iEnd; - int iCurrent; + const char *ZDUMMY; /* Dummy argument used with xNext() */ + int NDUMMY = 0; /* Dummy argument used with xNext() */ + int iStart = 0; + int iEnd = 0; + int iCurrent = 0; const char *zDoc; int nDoc; @@ -1389,9 +1409,10 @@ void sqlite3Fts3Offsets( } /* Initialize a tokenizer iterator to iterate through column iCol. */ - rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC); + rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, + zDoc, nDoc, &pC + ); if( rc!=SQLITE_OK ) goto offsets_out; - pC->pTokenizer = pTab->pTokenizer; rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent); while( rc==SQLITE_OK ){ @@ -1409,7 +1430,7 @@ void sqlite3Fts3Offsets( if( !pTerm ){ /* All offsets for this column have been gathered. */ - break; + rc = SQLITE_DONE; }else{ assert( iCurrent<=iMinPos ); if( 0==(0xFE&*pTerm->pList) ){ @@ -1426,8 +1447,8 @@ void sqlite3Fts3Offsets( "%d %d %d %d ", iCol, pTerm-sCtx.aTerm, iStart, iEnd-iStart ); rc = fts3StringAppend(&res, aBuffer, -1); - }else if( rc==SQLITE_DONE ){ - rc = SQLITE_CORRUPT_VTAB; + }else if( rc==SQLITE_DONE && pTab->zContentTbl==0 ){ + rc = FTS_CORRUPT_VTAB; } } } diff --git a/src/libtracker-fts/fts3_term.c b/src/libtracker-fts/fts3_term.c index d3eb690bd..c49d5cb65 100644 --- a/src/libtracker-fts/fts3_term.c +++ b/src/libtracker-fts/fts3_term.c @@ -73,6 +73,7 @@ static int fts3termConnectMethod( Fts3termTable *p; /* Virtual table object to return */ int iIndex = 0; + UNUSED_PARAMETER(pCtx); if( argc==5 ){ iIndex = atoi(argv[4]); argc--; @@ -87,9 +88,9 @@ static int fts3termConnectMethod( } zDb = argv[1]; - nDb = strlen(zDb); + nDb = (int)strlen(zDb); zFts3 = argv[3]; - nFts3 = strlen(zFts3); + nFts3 = (int)strlen(zFts3); rc = sqlite3_declare_vtab(db, FTS3_TERMS_SCHEMA); if( rc!=SQLITE_OK ) return rc; @@ -231,12 +232,12 @@ static int fts3termNextMethod(sqlite3_vtab_cursor *pCursor){ if( v==1 ){ pCsr->pNext += sqlite3Fts3GetVarint(pCsr->pNext, &v); - pCsr->iCol += v; + pCsr->iCol += (int)v; pCsr->iPos = 0; pCsr->pNext += sqlite3Fts3GetVarint(pCsr->pNext, &v); } - pCsr->iPos += (v - 2); + pCsr->iPos += (int)(v - 2); return SQLITE_OK; } @@ -271,7 +272,7 @@ static int fts3termFilterMethod( pCsr->filter.flags = FTS3_SEGMENT_REQUIRE_POS|FTS3_SEGMENT_IGNORE_EMPTY; pCsr->filter.flags |= FTS3_SEGMENT_SCAN; - rc = sqlite3Fts3SegReaderCursor(pFts3, p->iIndex, FTS3_SEGCURSOR_ALL, + rc = sqlite3Fts3SegReaderCursor(pFts3, 0, p->iIndex, FTS3_SEGCURSOR_ALL, pCsr->filter.zTerm, pCsr->filter.nTerm, 0, 1, &pCsr->csr ); if( rc==SQLITE_OK ){ @@ -357,7 +358,10 @@ int sqlite3Fts3InitTerm(sqlite3 *db){ 0, /* xCommit */ 0, /* xRollback */ 0, /* xFindFunction */ - 0 /* xRename */ + 0, /* xRename */ + 0, /* xSavepoint */ + 0, /* xRelease */ + 0 /* xRollbackTo */ }; int rc; /* Return code */ diff --git a/src/libtracker-fts/fts3_test.c b/src/libtracker-fts/fts3_test.c index 72735f3d1..4da0b8f13 100644 --- a/src/libtracker-fts/fts3_test.c +++ b/src/libtracker-fts/fts3_test.c @@ -13,13 +13,17 @@ ** This file is not part of the production FTS code. It is only used for ** testing. It contains a Tcl command that can be used to test if a document ** matches an FTS NEAR expression. +** +** As of March 2012, it also contains a version 1 tokenizer used for testing +** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly. */ #include <tcl.h> #include <string.h> #include <assert.h> -#ifdef SQLITE_TEST +#if defined(SQLITE_TEST) +#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) /* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */ #include "fts3Int.h" @@ -158,6 +162,8 @@ static int fts3_near_match_cmd( Tcl_Obj **apExprToken; int nExprToken; + UNUSED_PARAMETER(clientData); + /* Must have 3 or more arguments. */ if( objc<3 || (objc%2)==0 ){ Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?..."); @@ -311,14 +317,219 @@ static int fts3_configure_incr_load_cmd( Tcl_SetObjResult(interp, pRet); Tcl_DecrRefCount(pRet); #endif + UNUSED_PARAMETER(clientData); + return TCL_OK; +} + +#ifdef SQLITE_ENABLE_FTS3 +/************************************************************************** +** Beginning of test tokenizer code. +** +** For language 0, this tokenizer is similar to the default 'simple' +** tokenizer. For other languages L, the following: +** +** * Odd numbered languages are case-sensitive. Even numbered +** languages are not. +** +** * Language ids 100 or greater are considered an error. +** +** The implementation assumes that the input contains only ASCII characters +** (i.e. those that may be encoded in UTF-8 using a single byte). +*/ +typedef struct test_tokenizer { + sqlite3_tokenizer base; +} test_tokenizer; + +typedef struct test_tokenizer_cursor { + sqlite3_tokenizer_cursor base; + const char *aInput; /* Input being tokenized */ + int nInput; /* Size of the input in bytes */ + int iInput; /* Current offset in aInput */ + int iToken; /* Index of next token to be returned */ + char *aBuffer; /* Buffer containing current token */ + int nBuffer; /* Number of bytes allocated at pToken */ + int iLangid; /* Configured language id */ +} test_tokenizer_cursor; + +static int testTokenizerCreate( + int argc, const char * const *argv, + sqlite3_tokenizer **ppTokenizer +){ + test_tokenizer *pNew; + UNUSED_PARAMETER(argc); + UNUSED_PARAMETER(argv); + + pNew = sqlite3_malloc(sizeof(test_tokenizer)); + if( !pNew ) return SQLITE_NOMEM; + memset(pNew, 0, sizeof(test_tokenizer)); + + *ppTokenizer = (sqlite3_tokenizer *)pNew; + return SQLITE_OK; +} + +static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){ + test_tokenizer *p = (test_tokenizer *)pTokenizer; + sqlite3_free(p); + return SQLITE_OK; +} + +static int testTokenizerOpen( + sqlite3_tokenizer *pTokenizer, /* The tokenizer */ + const char *pInput, int nBytes, /* String to be tokenized */ + sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ +){ + int rc = SQLITE_OK; /* Return code */ + test_tokenizer_cursor *pCsr; /* New cursor object */ + + UNUSED_PARAMETER(pTokenizer); + + pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor)); + if( pCsr==0 ){ + rc = SQLITE_NOMEM; + }else{ + memset(pCsr, 0, sizeof(test_tokenizer_cursor)); + pCsr->aInput = pInput; + if( nBytes<0 ){ + pCsr->nInput = (int)strlen(pInput); + }else{ + pCsr->nInput = nBytes; + } + } + + *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; + return rc; +} + +static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){ + test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; + sqlite3_free(pCsr->aBuffer); + sqlite3_free(pCsr); + return SQLITE_OK; +} + +static int testIsTokenChar(char c){ + return (c>='a' && c<='z') || (c>='A' && c<='Z'); +} +static int testTolower(char c){ + char ret = c; + if( ret>='A' && ret<='Z') ret = ret - ('A'-'a'); + return ret; +} + +static int testTokenizerNext( + sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by testTokenizerOpen */ + const char **ppToken, /* OUT: *ppToken is the token text */ + int *pnBytes, /* OUT: Number of bytes in token */ + int *piStartOffset, /* OUT: Starting offset of token */ + int *piEndOffset, /* OUT: Ending offset of token */ + int *piPosition /* OUT: Position integer of token */ +){ + test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; + int rc = SQLITE_OK; + const char *p; + const char *pEnd; + + p = &pCsr->aInput[pCsr->iInput]; + pEnd = &pCsr->aInput[pCsr->nInput]; + + /* Skip past any white-space */ + assert( p<=pEnd ); + while( p<pEnd && testIsTokenChar(*p)==0 ) p++; + + if( p==pEnd ){ + rc = SQLITE_DONE; + }else{ + /* Advance to the end of the token */ + const char *pToken = p; + int nToken; + while( p<pEnd && testIsTokenChar(*p) ) p++; + nToken = (int)(p-pToken); + + /* Copy the token into the buffer */ + if( nToken>pCsr->nBuffer ){ + sqlite3_free(pCsr->aBuffer); + pCsr->aBuffer = sqlite3_malloc(nToken); + } + if( pCsr->aBuffer==0 ){ + rc = SQLITE_NOMEM; + }else{ + int i; + + if( pCsr->iLangid & 0x00000001 ){ + for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i]; + }else{ + for(i=0; i<nToken; i++) pCsr->aBuffer[i] = testTolower(pToken[i]); + } + pCsr->iToken++; + pCsr->iInput = (int)(p - pCsr->aInput); + + *ppToken = pCsr->aBuffer; + *pnBytes = nToken; + *piStartOffset = (int)(pToken - pCsr->aInput); + *piEndOffset = (int)(p - pCsr->aInput); + *piPosition = pCsr->iToken; + } + } + + return rc; +} + +static int testTokenizerLanguage( + sqlite3_tokenizer_cursor *pCursor, + int iLangid +){ + int rc = SQLITE_OK; + test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; + pCsr->iLangid = iLangid; + if( pCsr->iLangid>=100 ){ + rc = SQLITE_ERROR; + } + return rc; +} +#endif + +static int fts3_test_tokenizer_cmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ +#ifdef SQLITE_ENABLE_FTS3 + static const sqlite3_tokenizer_module testTokenizerModule = { + 1, + testTokenizerCreate, + testTokenizerDestroy, + testTokenizerOpen, + testTokenizerClose, + testTokenizerNext, + testTokenizerLanguage + }; + const sqlite3_tokenizer_module *pPtr = &testTokenizerModule; + if( objc!=1 ){ + Tcl_WrongNumArgs(interp, 1, objv, ""); + return TCL_ERROR; + } + Tcl_SetObjResult(interp, Tcl_NewByteArrayObj( + (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *) + )); +#endif + UNUSED_PARAMETER(clientData); return TCL_OK; } +/* +** End of tokenizer code. +**************************************************************************/ + int Sqlitetestfts3_Init(Tcl_Interp *interp){ Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0); Tcl_CreateObjCommand(interp, "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0 ); + Tcl_CreateObjCommand( + interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0 + ); return TCL_OK; } +#endif /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */ #endif /* ifdef SQLITE_TEST */ diff --git a/src/libtracker-fts/fts3_tokenizer.c b/src/libtracker-fts/fts3_tokenizer.c index 6494bb96d..8241be81f 100644 --- a/src/libtracker-fts/fts3_tokenizer.c +++ b/src/libtracker-fts/fts3_tokenizer.c @@ -209,10 +209,9 @@ int sqlite3Fts3InitTokenizer( /* ** Implementation of a special SQL scalar function for testing tokenizers ** designed to be used in concert with the Tcl testing framework. This -** function must be called with two arguments: +** function must be called with two or more arguments: ** -** SELECT <function-name>(<key-name>, <input-string>); -** SELECT <function-name>(<key-name>, <pointer>); +** SELECT <function-name>(<key-name>, ..., <input-string>); ** ** where <function-name> is the name passed as the second argument ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer') @@ -249,27 +248,27 @@ static void testFunc( const char *zInput; int nInput; - const char *zArg = 0; + const char *azArg[64]; const char *zToken; - int nToken; - int iStart; - int iEnd; - int iPos; + int nToken = 0; + int iStart = 0; + int iEnd = 0; + int iPos = 0; + int i; Tcl_Obj *pRet; - assert( argc==2 || argc==3 ); + if( argc<2 ){ + sqlite3_result_error(context, "insufficient arguments", -1); + return; + } nName = sqlite3_value_bytes(argv[0]); zName = (const char *)sqlite3_value_text(argv[0]); nInput = sqlite3_value_bytes(argv[argc-1]); zInput = (const char *)sqlite3_value_text(argv[argc-1]); - if( argc==3 ){ - zArg = (const char *)sqlite3_value_text(argv[1]); - } - pHash = (Fts3Hash *)sqlite3_user_data(context); p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1); @@ -283,16 +282,19 @@ static void testFunc( pRet = Tcl_NewObj(); Tcl_IncrRefCount(pRet); - if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){ + for(i=1; i<argc-1; i++){ + azArg[i-1] = (const char *)sqlite3_value_text(argv[i]); + } + + if( SQLITE_OK!=p->xCreate(argc-2, azArg, &pTokenizer) ){ zErr = "error in xCreate()"; goto finish; } pTokenizer->pModule = p; - if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){ + if( sqlite3Fts3OpenTokenizer(pTokenizer, 0, zInput, nInput, &pCsr) ){ zErr = "error in xOpen()"; goto finish; } - pCsr->pTokenizer = pTokenizer; while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){ Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos)); @@ -468,10 +470,7 @@ int sqlite3Fts3InitHashTable( } #ifdef SQLITE_TEST if( SQLITE_OK==rc ){ - rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0); - } - if( SQLITE_OK==rc ){ - rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0); + rc = sqlite3_create_function(db, zTest, -1, any, p, testFunc, 0, 0); } if( SQLITE_OK==rc ){ rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0); diff --git a/src/libtracker-fts/fts3_tokenizer.h b/src/libtracker-fts/fts3_tokenizer.h index 615644506..c91c7ed79 100644 --- a/src/libtracker-fts/fts3_tokenizer.h +++ b/src/libtracker-fts/fts3_tokenizer.h @@ -52,7 +52,7 @@ typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; struct sqlite3_tokenizer_module { /* - ** Structure version. Should always be set to 0. + ** Structure version. Should always be set to 0 or 1. */ int iVersion; @@ -133,6 +133,15 @@ struct sqlite3_tokenizer_module { int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */ int *piPosition /* OUT: Number of tokens returned before this one */ ); + + /*********************************************************************** + ** Methods below this point are only available if iVersion>=1. + */ + + /* + ** Configure the language id of a tokenizer cursor. + */ + int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid); }; struct sqlite3_tokenizer { diff --git a/src/libtracker-fts/fts3_tokenizer1.c b/src/libtracker-fts/fts3_tokenizer1.c index d11a49976..deea06d92 100644 --- a/src/libtracker-fts/fts3_tokenizer1.c +++ b/src/libtracker-fts/fts3_tokenizer1.c @@ -218,6 +218,7 @@ static const sqlite3_tokenizer_module simpleTokenizerModule = { simpleOpen, simpleClose, simpleNext, + 0, }; /* diff --git a/src/libtracker-fts/fts3_unicode.c b/src/libtracker-fts/fts3_unicode.c new file mode 100644 index 000000000..79941edbb --- /dev/null +++ b/src/libtracker-fts/fts3_unicode.c @@ -0,0 +1,393 @@ +/* +** 2012 May 24 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** Implementation of the "unicode" full-text-search tokenizer. +*/ + +#ifdef SQLITE_ENABLE_FTS4_UNICODE61 + +#include "fts3Int.h" +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) + +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include "fts3_tokenizer.h" + +/* +** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied +** from the sqlite3 source file utf.c. If this file is compiled as part +** of the amalgamation, they are not required. +*/ +#ifndef SQLITE_AMALGAMATION + +static const unsigned char sqlite3Utf8Trans1[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, +}; + +#define READ_UTF8(zIn, zTerm, c) \ + c = *(zIn++); \ + if( c>=0xc0 ){ \ + c = sqlite3Utf8Trans1[c-0xc0]; \ + while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){ \ + c = (c<<6) + (0x3f & *(zIn++)); \ + } \ + if( c<0x80 \ + || (c&0xFFFFF800)==0xD800 \ + || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ + } + +#define WRITE_UTF8(zOut, c) { \ + if( c<0x00080 ){ \ + *zOut++ = (u8)(c&0xFF); \ + } \ + else if( c<0x00800 ){ \ + *zOut++ = 0xC0 + (u8)((c>>6)&0x1F); \ + *zOut++ = 0x80 + (u8)(c & 0x3F); \ + } \ + else if( c<0x10000 ){ \ + *zOut++ = 0xE0 + (u8)((c>>12)&0x0F); \ + *zOut++ = 0x80 + (u8)((c>>6) & 0x3F); \ + *zOut++ = 0x80 + (u8)(c & 0x3F); \ + }else{ \ + *zOut++ = 0xF0 + (u8)((c>>18) & 0x07); \ + *zOut++ = 0x80 + (u8)((c>>12) & 0x3F); \ + *zOut++ = 0x80 + (u8)((c>>6) & 0x3F); \ + *zOut++ = 0x80 + (u8)(c & 0x3F); \ + } \ +} + +#endif /* ifndef SQLITE_AMALGAMATION */ + +typedef struct unicode_tokenizer unicode_tokenizer; +typedef struct unicode_cursor unicode_cursor; + +struct unicode_tokenizer { + sqlite3_tokenizer base; + int bRemoveDiacritic; + int nException; + int *aiException; +}; + +struct unicode_cursor { + sqlite3_tokenizer_cursor base; + const unsigned char *aInput; /* Input text being tokenized */ + int nInput; /* Size of aInput[] in bytes */ + int iOff; /* Current offset within aInput[] */ + int iToken; /* Index of next token to be returned */ + char *zToken; /* storage for current token */ + int nAlloc; /* space allocated at zToken */ +}; + + +/* +** Destroy a tokenizer allocated by unicodeCreate(). +*/ +static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){ + if( pTokenizer ){ + unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer; + sqlite3_free(p->aiException); + sqlite3_free(p); + } + return SQLITE_OK; +} + +/* +** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE +** statement has specified that the tokenizer for this table shall consider +** all characters in string zIn/nIn to be separators (if bAlnum==0) or +** token characters (if bAlnum==1). +** +** For each codepoint in the zIn/nIn string, this function checks if the +** sqlite3FtsUnicodeIsalnum() function already returns the desired result. +** If so, no action is taken. Otherwise, the codepoint is added to the +** unicode_tokenizer.aiException[] array. For the purposes of tokenization, +** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all +** codepoints in the aiException[] array. +** +** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic() +** identifies as a diacritic) occurs in the zIn/nIn string it is ignored. +** It is not possible to change the behaviour of the tokenizer with respect +** to these codepoints. +*/ +static int unicodeAddExceptions( + unicode_tokenizer *p, /* Tokenizer to add exceptions to */ + int bAlnum, /* Replace Isalnum() return value with this */ + const char *zIn, /* Array of characters to make exceptions */ + int nIn /* Length of z in bytes */ +){ + const unsigned char *z = (const unsigned char *)zIn; + const unsigned char *zTerm = &z[nIn]; + int iCode; + int nEntry = 0; + + assert( bAlnum==0 || bAlnum==1 ); + + while( z<zTerm ){ + READ_UTF8(z, zTerm, iCode); + assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 ); + if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum + && sqlite3FtsUnicodeIsdiacritic(iCode)==0 + ){ + nEntry++; + } + } + + if( nEntry ){ + int *aNew; /* New aiException[] array */ + int nNew; /* Number of valid entries in array aNew[] */ + + aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int)); + if( aNew==0 ) return SQLITE_NOMEM; + nNew = p->nException; + + z = (const unsigned char *)zIn; + while( z<zTerm ){ + READ_UTF8(z, zTerm, iCode); + if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum + && sqlite3FtsUnicodeIsdiacritic(iCode)==0 + ){ + int i, j; + for(i=0; i<nNew && aNew[i]<iCode; i++); + for(j=nNew; j>i; j--) aNew[j] = aNew[j-1]; + aNew[i] = iCode; + nNew++; + } + } + p->aiException = aNew; + p->nException = nNew; + } + + return SQLITE_OK; +} + +/* +** Return true if the p->aiException[] array contains the value iCode. +*/ +static int unicodeIsException(unicode_tokenizer *p, int iCode){ + if( p->nException>0 ){ + int *a = p->aiException; + int iLo = 0; + int iHi = p->nException-1; + + while( iHi>=iLo ){ + int iTest = (iHi + iLo) / 2; + if( iCode==a[iTest] ){ + return 1; + }else if( iCode>a[iTest] ){ + iLo = iTest+1; + }else{ + iHi = iTest-1; + } + } + } + + return 0; +} + +/* +** Return true if, for the purposes of tokenization, codepoint iCode is +** considered a token character (not a separator). +*/ +static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){ + assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 ); + return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode); +} + +/* +** Create a new tokenizer instance. +*/ +static int unicodeCreate( + int nArg, /* Size of array argv[] */ + const char * const *azArg, /* Tokenizer creation arguments */ + sqlite3_tokenizer **pp /* OUT: New tokenizer handle */ +){ + unicode_tokenizer *pNew; /* New tokenizer object */ + int i; + int rc = SQLITE_OK; + + pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer)); + if( pNew==NULL ) return SQLITE_NOMEM; + memset(pNew, 0, sizeof(unicode_tokenizer)); + pNew->bRemoveDiacritic = 1; + + for(i=0; rc==SQLITE_OK && i<nArg; i++){ + const char *z = azArg[i]; + int n = strlen(z); + + if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){ + pNew->bRemoveDiacritic = 1; + } + else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){ + pNew->bRemoveDiacritic = 0; + } + else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){ + rc = unicodeAddExceptions(pNew, 1, &z[11], n-11); + } + else if( n>=11 && memcmp("separators=", z, 11)==0 ){ + rc = unicodeAddExceptions(pNew, 0, &z[11], n-11); + } + else{ + /* Unrecognized argument */ + rc = SQLITE_ERROR; + } + } + + if( rc!=SQLITE_OK ){ + unicodeDestroy((sqlite3_tokenizer *)pNew); + pNew = 0; + } + *pp = (sqlite3_tokenizer *)pNew; + return rc; +} + +/* +** Prepare to begin tokenizing a particular string. The input +** string to be tokenized is pInput[0..nBytes-1]. A cursor +** used to incrementally tokenize this string is returned in +** *ppCursor. +*/ +static int unicodeOpen( + sqlite3_tokenizer *p, /* The tokenizer */ + const char *aInput, /* Input string */ + int nInput, /* Size of string aInput in bytes */ + sqlite3_tokenizer_cursor **pp /* OUT: New cursor object */ +){ + unicode_cursor *pCsr; + + pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor)); + if( pCsr==0 ){ + return SQLITE_NOMEM; + } + memset(pCsr, 0, sizeof(unicode_cursor)); + + pCsr->aInput = (const unsigned char *)aInput; + if( aInput==0 ){ + pCsr->nInput = 0; + }else if( nInput<0 ){ + pCsr->nInput = (int)strlen(aInput); + }else{ + pCsr->nInput = nInput; + } + + *pp = &pCsr->base; + UNUSED_PARAMETER(p); + return SQLITE_OK; +} + +/* +** Close a tokenization cursor previously opened by a call to +** simpleOpen() above. +*/ +static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){ + unicode_cursor *pCsr = (unicode_cursor *) pCursor; + sqlite3_free(pCsr->zToken); + sqlite3_free(pCsr); + return SQLITE_OK; +} + +/* +** Extract the next token from a tokenization cursor. The cursor must +** have been opened by a prior call to simpleOpen(). +*/ +static int unicodeNext( + sqlite3_tokenizer_cursor *pC, /* Cursor returned by simpleOpen */ + const char **paToken, /* OUT: Token text */ + int *pnToken, /* OUT: Number of bytes at *paToken */ + int *piStart, /* OUT: Starting offset of token */ + int *piEnd, /* OUT: Ending offset of token */ + int *piPos /* OUT: Position integer of token */ +){ + unicode_cursor *pCsr = (unicode_cursor *)pC; + unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer); + int iCode; + char *zOut; + const unsigned char *z = &pCsr->aInput[pCsr->iOff]; + const unsigned char *zStart = z; + const unsigned char *zEnd; + const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput]; + + /* Scan past any delimiter characters before the start of the next token. + ** Return SQLITE_DONE early if this takes us all the way to the end of + ** the input. */ + while( z<zTerm ){ + READ_UTF8(z, zTerm, iCode); + if( unicodeIsAlnum(p, iCode) ) break; + zStart = z; + } + if( zStart>=zTerm ) return SQLITE_DONE; + + zOut = pCsr->zToken; + do { + int iOut; + + /* Grow the output buffer if required. */ + if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){ + char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64); + if( !zNew ) return SQLITE_NOMEM; + zOut = &zNew[zOut - pCsr->zToken]; + pCsr->zToken = zNew; + pCsr->nAlloc += 64; + } + + /* Write the folded case of the last character read to the output */ + zEnd = z; + iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic); + if( iOut ){ + WRITE_UTF8(zOut, iOut); + } + + /* If the cursor is not at EOF, read the next character */ + if( z>=zTerm ) break; + READ_UTF8(z, zTerm, iCode); + }while( unicodeIsAlnum(p, iCode) + || sqlite3FtsUnicodeIsdiacritic(iCode) + ); + + /* Set the output variables and return. */ + pCsr->iOff = (z - pCsr->aInput); + *paToken = pCsr->zToken; + *pnToken = zOut - pCsr->zToken; + *piStart = (zStart - pCsr->aInput); + *piEnd = (zEnd - pCsr->aInput); + *piPos = pCsr->iToken++; + return SQLITE_OK; +} + +/* +** Set *ppModule to a pointer to the sqlite3_tokenizer_module +** structure for the unicode tokenizer. +*/ +void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){ + static const sqlite3_tokenizer_module module = { + 0, + unicodeCreate, + unicodeDestroy, + unicodeOpen, + unicodeClose, + unicodeNext, + 0, + }; + *ppModule = &module; +} + +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ +#endif /* ifndef SQLITE_ENABLE_FTS4_UNICODE61 */ diff --git a/src/libtracker-fts/fts3_unicode2.c b/src/libtracker-fts/fts3_unicode2.c new file mode 100644 index 000000000..3c2456902 --- /dev/null +++ b/src/libtracker-fts/fts3_unicode2.c @@ -0,0 +1,366 @@ +/* +** 2012 May 25 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +*/ + +/* +** DO NOT EDIT THIS MACHINE GENERATED FILE. +*/ + +#if defined(SQLITE_ENABLE_FTS4_UNICODE61) +#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) + +#include <assert.h> + +/* +** Return true if the argument corresponds to a unicode codepoint +** classified as either a letter or a number. Otherwise false. +** +** The results are undefined if the value passed to this function +** is less than zero. +*/ +int sqlite3FtsUnicodeIsalnum(int c){ + /* Each unsigned integer in the following array corresponds to a contiguous + ** range of unicode codepoints that are not either letters or numbers (i.e. + ** codepoints for which this function should return 0). + ** + ** The most significant 22 bits in each 32-bit value contain the first + ** codepoint in the range. The least significant 10 bits are used to store + ** the size of the range (always at least 1). In other words, the value + ** ((C<<22) + N) represents a range of N codepoints starting with codepoint + ** C. It is not possible to represent a range larger than 1023 codepoints + ** using this format. + */ + const static unsigned int aEntry[] = { + 0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07, + 0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01, + 0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401, + 0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01, + 0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01, + 0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802, + 0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F, + 0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401, + 0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804, + 0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403, + 0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812, + 0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001, + 0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802, + 0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805, + 0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401, + 0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03, + 0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807, + 0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001, + 0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01, + 0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804, + 0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001, + 0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802, + 0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01, + 0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06, + 0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007, + 0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006, + 0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417, + 0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14, + 0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07, + 0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01, + 0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001, + 0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802, + 0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F, + 0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002, + 0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802, + 0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006, + 0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D, + 0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802, + 0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027, + 0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403, + 0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805, + 0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04, + 0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401, + 0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005, + 0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B, + 0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A, + 0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001, + 0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59, + 0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807, + 0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01, + 0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E, + 0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100, + 0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10, + 0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402, + 0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804, + 0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012, + 0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004, + 0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002, + 0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803, + 0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07, + 0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02, + 0x037FFC02, 0x03E3FC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, + 0x03F4F802, 0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, + 0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, + 0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, + 0x04040003, 0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, + 0x040E7C01, 0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, + 0x04280403, 0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, + 0x04294009, 0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, + 0x04420003, 0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, + 0x04460003, 0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, + 0x05BD442E, 0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, + 0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, + 0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, + 0x075EA401, 0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, + 0x07C2800F, 0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, + 0x07C4C03C, 0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, + 0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, + 0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, + 0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, + 0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, + 0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001, + 0x43FFF401, + }; + static const unsigned int aAscii[4] = { + 0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001, + }; + + if( c<128 ){ + return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 ); + }else if( c<(1<<22) ){ + unsigned int key = (((unsigned int)c)<<10) | 0x000003FF; + int iRes; + int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; + int iLo = 0; + while( iHi>=iLo ){ + int iTest = (iHi + iLo) / 2; + if( key >= aEntry[iTest] ){ + iRes = iTest; + iLo = iTest+1; + }else{ + iHi = iTest-1; + } + } + assert( aEntry[0]<key ); + assert( key>=aEntry[iRes] ); + return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); + } + return 1; +} + + +/* +** If the argument is a codepoint corresponding to a lowercase letter +** in the ASCII range with a diacritic added, return the codepoint +** of the ASCII letter only. For example, if passed 235 - "LATIN +** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER +** E"). The resuls of passing a codepoint that corresponds to an +** uppercase letter are undefined. +*/ +static int remove_diacritic(int c){ + unsigned short aDia[] = { + 0, 1797, 1848, 1859, 1891, 1928, 1940, 1995, + 2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286, + 2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732, + 2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336, + 3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928, + 3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234, + 4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504, + 6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529, + 61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, + 61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, + 62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, + 62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, + 62924, 63050, 63082, 63274, 63390, + }; + char aChar[] = { + '\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c', + 'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r', + 's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o', + 'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r', + 'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0', + '\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h', + 'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't', + 'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a', + 'e', 'i', 'o', 'u', 'y', + }; + + unsigned int key = (((unsigned int)c)<<3) | 0x00000007; + int iRes = 0; + int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; + int iLo = 0; + while( iHi>=iLo ){ + int iTest = (iHi + iLo) / 2; + if( key >= aDia[iTest] ){ + iRes = iTest; + iLo = iTest+1; + }else{ + iHi = iTest-1; + } + } + assert( key>=aDia[iRes] ); + return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]); +}; + + +/* +** Return true if the argument interpreted as a unicode codepoint +** is a diacritical modifier character. +*/ +int sqlite3FtsUnicodeIsdiacritic(int c){ + unsigned int mask0 = 0x08029FDF; + unsigned int mask1 = 0x000361F8; + if( c<768 || c>817 ) return 0; + return (c < 768+32) ? + (mask0 & (1 << (c-768))) : + (mask1 & (1 << (c-768-32))); +} + + +/* +** Interpret the argument as a unicode codepoint. If the codepoint +** is an upper case character that has a lower case equivalent, +** return the codepoint corresponding to the lower case version. +** Otherwise, return a copy of the argument. +** +** The results are undefined if the value passed to this function +** is less than zero. +*/ +int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){ + /* Each entry in the following array defines a rule for folding a range + ** of codepoints to lower case. The rule applies to a range of nRange + ** codepoints starting at codepoint iCode. + ** + ** If the least significant bit in flags is clear, then the rule applies + ** to all nRange codepoints (i.e. all nRange codepoints are upper case and + ** need to be folded). Or, if it is set, then the rule only applies to + ** every second codepoint in the range, starting with codepoint C. + ** + ** The 7 most significant bits in flags are an index into the aiOff[] + ** array. If a specific codepoint C does require folding, then its lower + ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF). + ** + ** The contents of this array are generated by parsing the CaseFolding.txt + ** file distributed as part of the "Unicode Character Database". See + ** http://www.unicode.org for details. + */ + static const struct TableEntry { + unsigned short iCode; + unsigned char flags; + unsigned char nRange; + } aEntry[] = { + {65, 14, 26}, {181, 64, 1}, {192, 14, 23}, + {216, 14, 7}, {256, 1, 48}, {306, 1, 6}, + {313, 1, 16}, {330, 1, 46}, {376, 116, 1}, + {377, 1, 6}, {383, 104, 1}, {385, 50, 1}, + {386, 1, 4}, {390, 44, 1}, {391, 0, 1}, + {393, 42, 2}, {395, 0, 1}, {398, 32, 1}, + {399, 38, 1}, {400, 40, 1}, {401, 0, 1}, + {403, 42, 1}, {404, 46, 1}, {406, 52, 1}, + {407, 48, 1}, {408, 0, 1}, {412, 52, 1}, + {413, 54, 1}, {415, 56, 1}, {416, 1, 6}, + {422, 60, 1}, {423, 0, 1}, {425, 60, 1}, + {428, 0, 1}, {430, 60, 1}, {431, 0, 1}, + {433, 58, 2}, {435, 1, 4}, {439, 62, 1}, + {440, 0, 1}, {444, 0, 1}, {452, 2, 1}, + {453, 0, 1}, {455, 2, 1}, {456, 0, 1}, + {458, 2, 1}, {459, 1, 18}, {478, 1, 18}, + {497, 2, 1}, {498, 1, 4}, {502, 122, 1}, + {503, 134, 1}, {504, 1, 40}, {544, 110, 1}, + {546, 1, 18}, {570, 70, 1}, {571, 0, 1}, + {573, 108, 1}, {574, 68, 1}, {577, 0, 1}, + {579, 106, 1}, {580, 28, 1}, {581, 30, 1}, + {582, 1, 10}, {837, 36, 1}, {880, 1, 4}, + {886, 0, 1}, {902, 18, 1}, {904, 16, 3}, + {908, 26, 1}, {910, 24, 2}, {913, 14, 17}, + {931, 14, 9}, {962, 0, 1}, {975, 4, 1}, + {976, 140, 1}, {977, 142, 1}, {981, 146, 1}, + {982, 144, 1}, {984, 1, 24}, {1008, 136, 1}, + {1009, 138, 1}, {1012, 130, 1}, {1013, 128, 1}, + {1015, 0, 1}, {1017, 152, 1}, {1018, 0, 1}, + {1021, 110, 3}, {1024, 34, 16}, {1040, 14, 32}, + {1120, 1, 34}, {1162, 1, 54}, {1216, 6, 1}, + {1217, 1, 14}, {1232, 1, 88}, {1329, 22, 38}, + {4256, 66, 38}, {4295, 66, 1}, {4301, 66, 1}, + {7680, 1, 150}, {7835, 132, 1}, {7838, 96, 1}, + {7840, 1, 96}, {7944, 150, 8}, {7960, 150, 6}, + {7976, 150, 8}, {7992, 150, 8}, {8008, 150, 6}, + {8025, 151, 8}, {8040, 150, 8}, {8072, 150, 8}, + {8088, 150, 8}, {8104, 150, 8}, {8120, 150, 2}, + {8122, 126, 2}, {8124, 148, 1}, {8126, 100, 1}, + {8136, 124, 4}, {8140, 148, 1}, {8152, 150, 2}, + {8154, 120, 2}, {8168, 150, 2}, {8170, 118, 2}, + {8172, 152, 1}, {8184, 112, 2}, {8186, 114, 2}, + {8188, 148, 1}, {8486, 98, 1}, {8490, 92, 1}, + {8491, 94, 1}, {8498, 12, 1}, {8544, 8, 16}, + {8579, 0, 1}, {9398, 10, 26}, {11264, 22, 47}, + {11360, 0, 1}, {11362, 88, 1}, {11363, 102, 1}, + {11364, 90, 1}, {11367, 1, 6}, {11373, 84, 1}, + {11374, 86, 1}, {11375, 80, 1}, {11376, 82, 1}, + {11378, 0, 1}, {11381, 0, 1}, {11390, 78, 2}, + {11392, 1, 100}, {11499, 1, 4}, {11506, 0, 1}, + {42560, 1, 46}, {42624, 1, 24}, {42786, 1, 14}, + {42802, 1, 62}, {42873, 1, 4}, {42877, 76, 1}, + {42878, 1, 10}, {42891, 0, 1}, {42893, 74, 1}, + {42896, 1, 4}, {42912, 1, 10}, {42922, 72, 1}, + {65313, 14, 26}, + }; + static const unsigned short aiOff[] = { + 1, 2, 8, 15, 16, 26, 28, 32, + 37, 38, 40, 48, 63, 64, 69, 71, + 79, 80, 116, 202, 203, 205, 206, 207, + 209, 210, 211, 213, 214, 217, 218, 219, + 775, 7264, 10792, 10795, 23228, 23256, 30204, 54721, + 54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274, + 57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406, + 65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462, + 65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511, + 65514, 65521, 65527, 65528, 65529, + }; + + int ret = c; + + assert( c>=0 ); + assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); + + if( c<128 ){ + if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); + }else if( c<65536 ){ + int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; + int iLo = 0; + int iRes = -1; + + while( iHi>=iLo ){ + int iTest = (iHi + iLo) / 2; + int cmp = (c - aEntry[iTest].iCode); + if( cmp>=0 ){ + iRes = iTest; + iLo = iTest+1; + }else{ + iHi = iTest-1; + } + } + assert( iRes<0 || c>=aEntry[iRes].iCode ); + + if( iRes>=0 ){ + const struct TableEntry *p = &aEntry[iRes]; + if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ + ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; + assert( ret>0 ); + } + } + + if( bRemoveDiacritic ) ret = remove_diacritic(ret); + } + + else if( c>=66560 && c<66600 ){ + ret = c + 40; + } + + return ret; +} +#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */ +#endif /* !defined(SQLITE_ENABLE_FTS4_UNICODE61) */ diff --git a/src/libtracker-fts/fts3_write.c b/src/libtracker-fts/fts3_write.c index 36f2249e1..bda7fbbe1 100644 --- a/src/libtracker-fts/fts3_write.c +++ b/src/libtracker-fts/fts3_write.c @@ -24,6 +24,9 @@ #include <assert.h> #include <stdlib.h> + +#define FTS_MAX_APPENDABLE_HEIGHT 16 + /* ** When full-text index nodes are loaded from disk, the buffer that they ** are loaded into has the following number of bytes of padding at the end @@ -63,6 +66,29 @@ int test_fts3_node_chunk_threshold = (4*1024)*4; # define FTS3_NODE_CHUNK_THRESHOLD (FTS3_NODE_CHUNKSIZE*4) #endif +/* +** The two values that may be meaningfully bound to the :1 parameter in +** statements SQL_REPLACE_STAT and SQL_SELECT_STAT. +*/ +#define FTS_STAT_DOCTOTAL 0 +#define FTS_STAT_INCRMERGEHINT 1 +#define FTS_STAT_AUTOINCRMERGE 2 + +/* +** If FTS_LOG_MERGES is defined, call sqlite3_log() to report each automatic +** and incremental merge operation that takes place. This is used for +** debugging FTS only, it should not usually be turned on in production +** systems. +*/ +#ifdef FTS3_LOG_MERGES +static void fts3LogMerge(int nMerge, sqlite3_int64 iAbsLevel){ + sqlite3_log(SQLITE_OK, "%d-way merge from level %d", nMerge, (int)iAbsLevel); +} +#else +#define fts3LogMerge(x, y) +#endif + + typedef struct PendingList PendingList; typedef struct SegmentNode SegmentNode; typedef struct SegmentWriter SegmentWriter; @@ -110,6 +136,8 @@ struct Fts3DeferredToken { */ struct Fts3SegReader { int iIdx; /* Index within level, or 0x7FFFFFFF for PT */ + u8 bLookup; /* True for a lookup only */ + u8 rootOnly; /* True for a root-only reader */ sqlite3_int64 iStartBlock; /* Rowid of first leaf block to traverse */ sqlite3_int64 iLeafEndBlock; /* Rowid of final leaf block to traverse */ @@ -143,7 +171,7 @@ struct Fts3SegReader { }; #define fts3SegReaderIsPending(p) ((p)->ppNextElem!=0) -#define fts3SegReaderIsRootOnly(p) ((p)->aNode==(char *)&(p)[1]) +#define fts3SegReaderIsRootOnly(p) ((p)->rootOnly!=0) /* ** An instance of this structure is used to create a segment b-tree in the @@ -223,13 +251,22 @@ struct SegmentNode { #define SQL_DELETE_DOCSIZE 19 #define SQL_REPLACE_DOCSIZE 20 #define SQL_SELECT_DOCSIZE 21 -#define SQL_SELECT_DOCTOTAL 22 -#define SQL_REPLACE_DOCTOTAL 23 +#define SQL_SELECT_STAT 22 +#define SQL_REPLACE_STAT 23 #define SQL_SELECT_ALL_PREFIX_LEVEL 24 #define SQL_DELETE_ALL_TERMS_SEGDIR 25 - #define SQL_DELETE_SEGDIR_RANGE 26 +#define SQL_SELECT_ALL_LANGID 27 +#define SQL_FIND_MERGE_LEVEL 28 +#define SQL_MAX_LEAF_NODE_ESTIMATE 29 +#define SQL_DELETE_SEGDIR_ENTRY 30 +#define SQL_SHIFT_SEGDIR_ENTRY 31 +#define SQL_SELECT_SEGDIR 32 +#define SQL_CHOMP_SEGDIR 33 +#define SQL_SEGMENT_IS_APPENDABLE 34 +#define SQL_SELECT_INDEXES 35 +#define SQL_SELECT_MXLEVEL 36 /* ** This function is used to obtain an SQLite prepared statement handle @@ -256,11 +293,11 @@ static int fts3SqlStmt( /* 4 */ "DELETE FROM %Q.'%q_segdir'", /* 5 */ "DELETE FROM %Q.'%q_docsize'", /* 6 */ "DELETE FROM %Q.'%q_stat'", -/* 7 */ "SELECT %s FROM %Q.'%q_content' AS x WHERE rowid=?", +/* 7 */ "SELECT %s WHERE rowid=?", /* 8 */ "SELECT (SELECT max(idx) FROM %Q.'%q_segdir' WHERE level = ?) + 1", -/* 9 */ "INSERT INTO %Q.'%q_segments'(blockid, block) VALUES(?, ?)", +/* 9 */ "REPLACE INTO %Q.'%q_segments'(blockid, block) VALUES(?, ?)", /* 10 */ "SELECT coalesce((SELECT max(blockid) FROM %Q.'%q_segments') + 1, 1)", -/* 11 */ "INSERT INTO %Q.'%q_segdir' VALUES(?,?,?,?,?,?)", +/* 11 */ "REPLACE INTO %Q.'%q_segdir' VALUES(?,?,?,?,?,?)", /* Return segments in order from oldest to newest.*/ /* 12 */ "SELECT idx, start_block, leaves_end_block, end_block, root " @@ -278,13 +315,61 @@ static int fts3SqlStmt( /* 19 */ "DELETE FROM %Q.'%q_docsize' WHERE docid = ?", /* 20 */ "REPLACE INTO %Q.'%q_docsize' VALUES(?,?)", /* 21 */ "SELECT size FROM %Q.'%q_docsize' WHERE docid=?", -/* 22 */ "SELECT value FROM %Q.'%q_stat' WHERE id=0", -/* 23 */ "REPLACE INTO %Q.'%q_stat' VALUES(0,?)", +/* 22 */ "SELECT value FROM %Q.'%q_stat' WHERE id=?", +/* 23 */ "REPLACE INTO %Q.'%q_stat' VALUES(?,?)", /* 24 */ "", /* 25 */ "", /* 26 */ "DELETE FROM %Q.'%q_segdir' WHERE level BETWEEN ? AND ?", - +/* 27 */ "SELECT DISTINCT level / (1024 * ?) FROM %Q.'%q_segdir'", + +/* This statement is used to determine which level to read the input from +** when performing an incremental merge. It returns the absolute level number +** of the oldest level in the db that contains at least ? segments. Or, +** if no level in the FTS index contains more than ? segments, the statement +** returns zero rows. */ +/* 28 */ "SELECT level FROM %Q.'%q_segdir' GROUP BY level HAVING count(*)>=?" + " ORDER BY (level %% 1024) ASC LIMIT 1", + +/* Estimate the upper limit on the number of leaf nodes in a new segment +** created by merging the oldest :2 segments from absolute level :1. See +** function sqlite3Fts3Incrmerge() for details. */ +/* 29 */ "SELECT 2 * total(1 + leaves_end_block - start_block) " + " FROM %Q.'%q_segdir' WHERE level = ? AND idx < ?", + +/* SQL_DELETE_SEGDIR_ENTRY +** Delete the %_segdir entry on absolute level :1 with index :2. */ +/* 30 */ "DELETE FROM %Q.'%q_segdir' WHERE level = ? AND idx = ?", + +/* SQL_SHIFT_SEGDIR_ENTRY +** Modify the idx value for the segment with idx=:3 on absolute level :2 +** to :1. */ +/* 31 */ "UPDATE %Q.'%q_segdir' SET idx = ? WHERE level=? AND idx=?", + +/* SQL_SELECT_SEGDIR +** Read a single entry from the %_segdir table. The entry from absolute +** level :1 with index value :2. */ +/* 32 */ "SELECT idx, start_block, leaves_end_block, end_block, root " + "FROM %Q.'%q_segdir' WHERE level = ? AND idx = ?", + +/* SQL_CHOMP_SEGDIR +** Update the start_block (:1) and root (:2) fields of the %_segdir +** entry located on absolute level :3 with index :4. */ +/* 33 */ "UPDATE %Q.'%q_segdir' SET start_block = ?, root = ?" + "WHERE level = ? AND idx = ?", + +/* SQL_SEGMENT_IS_APPENDABLE +** Return a single row if the segment with end_block=? is appendable. Or +** no rows otherwise. */ +/* 34 */ "SELECT 1 FROM %Q.'%q_segments' WHERE blockid=? AND block IS NULL", + +/* SQL_SELECT_INDEXES +** Return the list of valid segment indexes for absolute level ? */ +/* 35 */ "SELECT idx FROM %Q.'%q_segdir' WHERE level=? ORDER BY 1 ASC", + +/* SQL_SELECT_MXLEVEL +** Return the largest relative level in the FTS index or indexes. */ +/* 36 */ "SELECT max( level %% 1024 ) FROM %Q.'%q_segdir'" }; int rc = SQLITE_OK; sqlite3_stmt *pStmt; @@ -298,7 +383,7 @@ static int fts3SqlStmt( if( eStmt==SQL_CONTENT_INSERT ){ zSql = sqlite3_mprintf(azSql[eStmt], p->zDb, p->zName, p->zWriteExprlist); }else if( eStmt==SQL_SELECT_CONTENT_BY_ROWID ){ - zSql = sqlite3_mprintf(azSql[eStmt], p->zReadExprlist, p->zDb, p->zName); + zSql = sqlite3_mprintf(azSql[eStmt], p->zReadExprlist); }else{ zSql = sqlite3_mprintf(azSql[eStmt], p->zDb, p->zName); } @@ -322,26 +407,22 @@ static int fts3SqlStmt( return rc; } + static int fts3SelectDocsize( Fts3Table *pTab, /* FTS3 table handle */ - int eStmt, /* Either SQL_SELECT_DOCSIZE or DOCTOTAL */ sqlite3_int64 iDocid, /* Docid to bind for SQL_SELECT_DOCSIZE */ sqlite3_stmt **ppStmt /* OUT: Statement handle */ ){ sqlite3_stmt *pStmt = 0; /* Statement requested from fts3SqlStmt() */ int rc; /* Return code */ - assert( eStmt==SQL_SELECT_DOCSIZE || eStmt==SQL_SELECT_DOCTOTAL ); - - rc = fts3SqlStmt(pTab, eStmt, &pStmt, 0); + rc = fts3SqlStmt(pTab, SQL_SELECT_DOCSIZE, &pStmt, 0); if( rc==SQLITE_OK ){ - if( eStmt==SQL_SELECT_DOCSIZE ){ - sqlite3_bind_int64(pStmt, 1, iDocid); - } + sqlite3_bind_int64(pStmt, 1, iDocid); rc = sqlite3_step(pStmt); if( rc!=SQLITE_ROW || sqlite3_column_type(pStmt, 0)!=SQLITE_BLOB ){ rc = sqlite3_reset(pStmt); - if( rc==SQLITE_OK ) rc = SQLITE_CORRUPT_VTAB; + if( rc==SQLITE_OK ) rc = FTS_CORRUPT_VTAB; pStmt = 0; }else{ rc = SQLITE_OK; @@ -356,7 +437,21 @@ int sqlite3Fts3SelectDoctotal( Fts3Table *pTab, /* Fts3 table handle */ sqlite3_stmt **ppStmt /* OUT: Statement handle */ ){ - return fts3SelectDocsize(pTab, SQL_SELECT_DOCTOTAL, 0, ppStmt); + sqlite3_stmt *pStmt = 0; + int rc; + rc = fts3SqlStmt(pTab, SQL_SELECT_STAT, &pStmt, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int(pStmt, 1, FTS_STAT_DOCTOTAL); + if( sqlite3_step(pStmt)!=SQLITE_ROW + || sqlite3_column_type(pStmt, 0)!=SQLITE_BLOB + ){ + rc = sqlite3_reset(pStmt); + if( rc==SQLITE_OK ) rc = FTS_CORRUPT_VTAB; + pStmt = 0; + } + } + *ppStmt = pStmt; + return rc; } int sqlite3Fts3SelectDocsize( @@ -364,7 +459,7 @@ int sqlite3Fts3SelectDocsize( sqlite3_int64 iDocid, /* Docid to read size data for */ sqlite3_stmt **ppStmt /* OUT: Statement handle */ ){ - return fts3SelectDocsize(pTab, SQL_SELECT_DOCSIZE, iDocid, ppStmt); + return fts3SelectDocsize(pTab, iDocid, ppStmt); } /* @@ -409,21 +504,66 @@ static void fts3SqlExec( ** not what users expect when they get SQLITE_LOCKED_SHAREDCACHE. It can ** still happen if the user reads data directly from the %_segments or ** %_segdir tables instead of going through FTS3 though. +** +** This reasoning does not apply to a content=xxx table. */ int sqlite3Fts3ReadLock(Fts3Table *p){ int rc; /* Return code */ sqlite3_stmt *pStmt; /* Statement used to obtain lock */ - rc = fts3SqlStmt(p, SQL_SELECT_CONTENT_BY_ROWID, &pStmt, 0); - if( rc==SQLITE_OK ){ - sqlite3_bind_null(pStmt, 1); - sqlite3_step(pStmt); - rc = sqlite3_reset(pStmt); + if( p->zContentTbl==0 ){ + rc = fts3SqlStmt(p, SQL_SELECT_CONTENT_BY_ROWID, &pStmt, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_null(pStmt, 1); + sqlite3_step(pStmt); + rc = sqlite3_reset(pStmt); + } + }else{ + rc = SQLITE_OK; } + return rc; } /* +** FTS maintains a separate indexes for each language-id (a 32-bit integer). +** Within each language id, a separate index is maintained to store the +** document terms, and each configured prefix size (configured the FTS +** "prefix=" option). And each index consists of multiple levels ("relative +** levels"). +** +** All three of these values (the language id, the specific index and the +** level within the index) are encoded in 64-bit integer values stored +** in the %_segdir table on disk. This function is used to convert three +** separate component values into the single 64-bit integer value that +** can be used to query the %_segdir table. +** +** Specifically, each language-id/index combination is allocated 1024 +** 64-bit integer level values ("absolute levels"). The main terms index +** for language-id 0 is allocate values 0-1023. The first prefix index +** (if any) for language-id 0 is allocated values 1024-2047. And so on. +** Language 1 indexes are allocated immediately following language 0. +** +** So, for a system with nPrefix prefix indexes configured, the block of +** absolute levels that corresponds to language-id iLangid and index +** iIndex starts at absolute level ((iLangid * (nPrefix+1) + iIndex) * 1024). +*/ +static sqlite3_int64 getAbsoluteLevel( + Fts3Table *p, /* FTS3 table handle */ + int iLangid, /* Language id */ + int iIndex, /* Index in p->aIndex[] */ + int iLevel /* Level of segments */ +){ + sqlite3_int64 iBase; /* First absolute level for iLangid/iIndex */ + assert( iLangid>=0 ); + assert( p->nIndex>0 ); + assert( iIndex>=0 && iIndex<p->nIndex ); + + iBase = ((sqlite3_int64)iLangid * p->nIndex + iIndex) * FTS3_SEGDIR_MAXLEVEL; + return iBase + iLevel; +} + +/* ** Set *ppStmt to a statement handle that may be used to iterate through ** all rows in the %_segdir table, from oldest to newest. If successful, ** return SQLITE_OK. If an error occurs while preparing the statement, @@ -442,8 +582,9 @@ int sqlite3Fts3ReadLock(Fts3Table *p){ */ int sqlite3Fts3AllSegdirs( Fts3Table *p, /* FTS3 table */ + int iLangid, /* Language being queried */ int iIndex, /* Index for p->aIndex[] */ - int iLevel, /* Level to select */ + int iLevel, /* Level to select (relative level) */ sqlite3_stmt **ppStmt /* OUT: Compiled statement */ ){ int rc; @@ -457,14 +598,16 @@ int sqlite3Fts3AllSegdirs( /* "SELECT * FROM %_segdir WHERE level BETWEEN ? AND ? ORDER BY ..." */ rc = fts3SqlStmt(p, SQL_SELECT_LEVEL_RANGE, &pStmt, 0); if( rc==SQLITE_OK ){ - sqlite3_bind_int(pStmt, 1, iIndex*FTS3_SEGDIR_MAXLEVEL); - sqlite3_bind_int(pStmt, 2, (iIndex+1)*FTS3_SEGDIR_MAXLEVEL-1); + sqlite3_bind_int64(pStmt, 1, getAbsoluteLevel(p, iLangid, iIndex, 0)); + sqlite3_bind_int64(pStmt, 2, + getAbsoluteLevel(p, iLangid, iIndex, FTS3_SEGDIR_MAXLEVEL-1) + ); } }else{ /* "SELECT * FROM %_segdir WHERE level = ? ORDER BY ..." */ rc = fts3SqlStmt(p, SQL_SELECT_LEVEL, &pStmt, 0); if( rc==SQLITE_OK ){ - sqlite3_bind_int(pStmt, 1, iLevel+iIndex*FTS3_SEGDIR_MAXLEVEL); + sqlite3_bind_int64(pStmt, 1, getAbsoluteLevel(p, iLangid, iIndex,iLevel)); } } *ppStmt = pStmt; @@ -630,18 +773,19 @@ static int fts3PendingTermsAddOne( */ static int fts3PendingTermsAdd( Fts3Table *p, /* Table into which text will be inserted */ + int iLangid, /* Language id to use */ const char *zText, /* Text of document to be inserted */ int iCol, /* Column into which text is being inserted */ - u32 *pnWord /* OUT: Number of tokens inserted */ + u32 *pnWord /* IN/OUT: Incr. by number tokens inserted */ ){ int rc; - int iStart; - int iEnd; - int iPos; + int iStart = 0; + int iEnd = 0; + int iPos = 0; int nWord = 0; char const *zToken; - int nToken; + int nToken = 0; sqlite3_tokenizer *pTokenizer = p->pTokenizer; sqlite3_tokenizer_module const *pModule = pTokenizer->pModule; @@ -659,11 +803,10 @@ static int fts3PendingTermsAdd( return SQLITE_OK; } - rc = pModule->xOpen(pTokenizer, zText, -1, &pCsr); + rc = sqlite3Fts3OpenTokenizer(pTokenizer, iLangid, zText, -1, &pCsr); if( rc!=SQLITE_OK ){ return rc; } - pCsr->pTokenizer = pTokenizer; xNext = pModule->xNext; while( SQLITE_OK==rc @@ -697,7 +840,7 @@ static int fts3PendingTermsAdd( } pModule->xClose(pCsr); - *pnWord = nWord; + *pnWord += nWord; return (rc==SQLITE_DONE ? SQLITE_OK : rc); } @@ -706,18 +849,28 @@ static int fts3PendingTermsAdd( ** fts3PendingTermsAdd() are to add term/position-list pairs for the ** contents of the document with docid iDocid. */ -static int fts3PendingTermsDocid(Fts3Table *p, sqlite_int64 iDocid){ +static int fts3PendingTermsDocid( + Fts3Table *p, /* Full-text table handle */ + int iLangid, /* Language id of row being written */ + sqlite_int64 iDocid /* Docid of row being written */ +){ + assert( iLangid>=0 ); + /* TODO(shess) Explore whether partially flushing the buffer on ** forced-flush would provide better performance. I suspect that if ** we ordered the doclists by size and flushed the largest until the ** buffer was half empty, that would let the less frequent terms ** generate longer doclists. */ - if( iDocid<=p->iPrevDocid || p->nPendingData>p->nMaxPendingData ){ + if( iDocid<=p->iPrevDocid + || p->iPrevLangid!=iLangid + || p->nPendingData>p->nMaxPendingData + ){ int rc = sqlite3Fts3PendingTermsFlush(p); if( rc!=SQLITE_OK ) return rc; } p->iPrevDocid = iDocid; + p->iPrevLangid = iLangid; return SQLITE_OK; } @@ -746,11 +899,16 @@ void sqlite3Fts3PendingTermsClear(Fts3Table *p){ ** Argument apVal is the same as the similarly named argument passed to ** fts3InsertData(). Parameter iDocid is the docid of the new row. */ -static int fts3InsertTerms(Fts3Table *p, sqlite3_value **apVal, u32 *aSz){ +static int fts3InsertTerms( + Fts3Table *p, + int iLangid, + sqlite3_value **apVal, + u32 *aSz +){ int i; /* Iterator variable */ for(i=2; i<p->nColumn+2; i++){ const char *zText = (const char *)sqlite3_value_text(apVal[i]); - int rc = fts3PendingTermsAdd(p, zText, i-2, &aSz[i-2]); + int rc = fts3PendingTermsAdd(p, iLangid, zText, i-2, &aSz[i-2]); if( rc!=SQLITE_OK ){ return rc; } @@ -771,6 +929,7 @@ static int fts3InsertTerms(Fts3Table *p, sqlite3_value **apVal, u32 *aSz){ ** apVal[p->nColumn+1] Right-most user-defined column ** apVal[p->nColumn+2] Hidden column with same name as table ** apVal[p->nColumn+3] Hidden "docid" column (alias for rowid) +** apVal[p->nColumn+4] Hidden languageid column */ static int fts3InsertData( Fts3Table *p, /* Full-text table */ @@ -780,6 +939,18 @@ static int fts3InsertData( int rc; /* Return code */ sqlite3_stmt *pContentInsert; /* INSERT INTO %_content VALUES(...) */ + if( p->zContentTbl ){ + sqlite3_value *pRowid = apVal[p->nColumn+3]; + if( sqlite3_value_type(pRowid)==SQLITE_NULL ){ + pRowid = apVal[1]; + } + if( sqlite3_value_type(pRowid)!=SQLITE_INTEGER ){ + return SQLITE_CONSTRAINT; + } + *piDocid = sqlite3_value_int64(pRowid); + return SQLITE_OK; + } + /* Locate the statement handle used to insert data into the %_content ** table. The SQL for this statement is: ** @@ -789,9 +960,13 @@ static int fts3InsertData( ** defined columns in the FTS3 table, plus one for the docid field. */ rc = fts3SqlStmt(p, SQL_CONTENT_INSERT, &pContentInsert, &apVal[1]); - if( rc!=SQLITE_OK ){ - return rc; + if( rc==SQLITE_OK && p->zLanguageid ){ + rc = sqlite3_bind_int( + pContentInsert, p->nColumn+2, + sqlite3_value_int(apVal[p->nColumn+4]) + ); } + if( rc!=SQLITE_OK ) return rc; /* There is a quirk here. The users INSERT statement may have specified ** a value for the "rowid" field, for the "docid" field, or for both. @@ -830,14 +1005,16 @@ static int fts3InsertData( ** Remove all data from the FTS3 table. Clear the hash table containing ** pending terms. */ -static int fts3DeleteAll(Fts3Table *p){ +static int fts3DeleteAll(Fts3Table *p, int bContent){ int rc = SQLITE_OK; /* Return code */ /* Discard the contents of the pending-terms hash table. */ sqlite3Fts3PendingTermsClear(p); - /* Delete everything from the %_content, %_segments and %_segdir tables. */ - fts3SqlExec(&rc, p, SQL_DELETE_ALL_CONTENT, 0); + /* Delete everything from the shadow tables. Except, leave %_content as + ** is if bContent is false. */ + assert( p->zContentTbl==0 || bContent==0 ); + if( bContent ) fts3SqlExec(&rc, p, SQL_DELETE_ALL_CONTENT, 0); fts3SqlExec(&rc, p, SQL_DELETE_ALL_SEGMENTS, 0); fts3SqlExec(&rc, p, SQL_DELETE_ALL_SEGDIR, 0); if( p->bHasDocsize ){ @@ -850,6 +1027,15 @@ static int fts3DeleteAll(Fts3Table *p){ } /* +** +*/ +static int langidFromSelect(Fts3Table *p, sqlite3_stmt *pSelect){ + int iLangid = 0; + if( p->zLanguageid ) iLangid = sqlite3_column_int(pSelect, p->nColumn+1); + return iLangid; +} + +/* ** The first element in the apVal[] array is assumed to contain the docid ** (an integer) of a row about to be deleted. Remove all terms from the ** full-text index. @@ -858,26 +1044,31 @@ static void fts3DeleteTerms( int *pRC, /* Result code */ Fts3Table *p, /* The FTS table to delete from */ sqlite3_value *pRowid, /* The docid to be deleted */ - u32 *aSz /* Sizes of deleted document written here */ + u32 *aSz, /* Sizes of deleted document written here */ + int *pbFound /* OUT: Set to true if row really does exist */ ){ int rc; sqlite3_stmt *pSelect; + assert( *pbFound==0 ); if( *pRC ) return; rc = fts3SqlStmt(p, SQL_SELECT_CONTENT_BY_ROWID, &pSelect, &pRowid); if( rc==SQLITE_OK ){ if( SQLITE_ROW==sqlite3_step(pSelect) ){ int i; - for(i=1; i<=p->nColumn; i++){ + int iLangid = langidFromSelect(p, pSelect); + rc = fts3PendingTermsDocid(p, iLangid, sqlite3_column_int64(pSelect, 0)); + for(i=1; rc==SQLITE_OK && i<=p->nColumn; i++){ const char *zText = (const char *)sqlite3_column_text(pSelect, i); - rc = fts3PendingTermsAdd(p, zText, -1, &aSz[i-1]); - if( rc!=SQLITE_OK ){ - sqlite3_reset(pSelect); - *pRC = rc; - return; - } + rc = fts3PendingTermsAdd(p, iLangid, zText, -1, &aSz[i-1]); aSz[p->nColumn] += sqlite3_column_bytes(pSelect, i); } + if( rc!=SQLITE_OK ){ + sqlite3_reset(pSelect); + *pRC = rc; + return; + } + *pbFound = 1; } rc = sqlite3_reset(pSelect); }else{ @@ -890,7 +1081,7 @@ static void fts3DeleteTerms( ** Forward declaration to account for the circular dependency between ** functions fts3SegmentMerge() and fts3AllocateSegdirIdx(). */ -static int fts3SegmentMerge(Fts3Table *, int, int); +static int fts3SegmentMerge(Fts3Table *, int, int, int); /* ** This function allocates a new level iLevel index in the segdir table. @@ -909,6 +1100,7 @@ static int fts3SegmentMerge(Fts3Table *, int, int); */ static int fts3AllocateSegdirIdx( Fts3Table *p, + int iLangid, /* Language id */ int iIndex, /* Index for p->aIndex */ int iLevel, int *piIdx @@ -917,10 +1109,15 @@ static int fts3AllocateSegdirIdx( sqlite3_stmt *pNextIdx; /* Query for next idx at level iLevel */ int iNext = 0; /* Result of query pNextIdx */ + assert( iLangid>=0 ); + assert( p->nIndex>=1 ); + /* Set variable iNext to the next available segdir index at level iLevel. */ rc = fts3SqlStmt(p, SQL_NEXT_SEGMENT_INDEX, &pNextIdx, 0); if( rc==SQLITE_OK ){ - sqlite3_bind_int(pNextIdx, 1, iIndex*FTS3_SEGDIR_MAXLEVEL + iLevel); + sqlite3_bind_int64( + pNextIdx, 1, getAbsoluteLevel(p, iLangid, iIndex, iLevel) + ); if( SQLITE_ROW==sqlite3_step(pNextIdx) ){ iNext = sqlite3_column_int(pNextIdx, 0); } @@ -934,7 +1131,8 @@ static int fts3AllocateSegdirIdx( ** if iNext is less than FTS3_MERGE_COUNT, allocate index iNext. */ if( iNext>=FTS3_MERGE_COUNT ){ - rc = fts3SegmentMerge(p, iIndex, iLevel); + fts3LogMerge(16, getAbsoluteLevel(p, iLangid, iIndex, iLevel)); + rc = fts3SegmentMerge(p, iLangid, iIndex, iLevel); *piIdx = 0; }else{ *piIdx = iNext; @@ -981,7 +1179,7 @@ int sqlite3Fts3ReadBlock( int rc; /* Return code */ /* pnBlob must be non-NULL. paBlob may be NULL or non-NULL. */ - assert( pnBlob); + assert( pnBlob ); if( p->pSegments ){ rc = sqlite3_blob_reopen(p->pSegments, iBlockid); @@ -1068,6 +1266,18 @@ static int fts3SegReaderRequire(Fts3SegReader *pReader, char *pFrom, int nByte){ } /* +** Set an Fts3SegReader cursor to point at EOF. +*/ +static void fts3SegReaderSetEof(Fts3SegReader *pSeg){ + if( !fts3SegReaderIsRootOnly(pSeg) ){ + sqlite3_free(pSeg->aNode); + sqlite3_blob_close(pSeg->pBlob); + pSeg->pBlob = 0; + } + pSeg->aNode = 0; +} + +/* ** Move the iterator passed as the first argument to the next term in the ** segment. If successful, SQLITE_OK is returned. If there is no next term, ** SQLITE_DONE. Otherwise, an SQLite error code. @@ -1106,12 +1316,7 @@ static int fts3SegReaderNext( return SQLITE_OK; } - if( !fts3SegReaderIsRootOnly(pReader) ){ - sqlite3_free(pReader->aNode); - sqlite3_blob_close(pReader->pBlob); - pReader->pBlob = 0; - } - pReader->aNode = 0; + fts3SegReaderSetEof(pReader); /* If iCurrentBlock>=iLeafEndBlock, this is an EOF condition. All leaf ** blocks have already been traversed. */ @@ -1145,7 +1350,7 @@ static int fts3SegReaderNext( if( nPrefix<0 || nSuffix<=0 || &pNext[nSuffix]>&pReader->aNode[pReader->nNode] ){ - return SQLITE_CORRUPT_VTAB; + return FTS_CORRUPT_VTAB; } if( nPrefix+nSuffix>pReader->nTermAlloc ){ @@ -1175,7 +1380,7 @@ static int fts3SegReaderNext( if( &pReader->aDoclist[pReader->nDoclist]>&pReader->aNode[pReader->nNode] || (pReader->nPopulate==0 && pReader->aDoclist[pReader->nDoclist-1]) ){ - return SQLITE_CORRUPT_VTAB; + return FTS_CORRUPT_VTAB; } return SQLITE_OK; } @@ -1315,7 +1520,7 @@ int sqlite3Fts3MsrOvfl( int rc = SQLITE_OK; int pgsz = p->nPgsz; - assert( p->bHasStat ); + assert( p->bFts4 ); assert( pgsz>0 ); for(ii=0; rc==SQLITE_OK && ii<pMsr->nSegment; ii++){ @@ -1358,6 +1563,7 @@ void sqlite3Fts3SegReaderFree(Fts3SegReader *pReader){ */ int sqlite3Fts3SegReaderNew( int iAge, /* Segment "age". */ + int bLookup, /* True for a lookup only */ sqlite3_int64 iStartLeaf, /* First leaf to traverse */ sqlite3_int64 iEndLeaf, /* Final leaf to traverse */ sqlite3_int64 iEndBlock, /* Final block of segment */ @@ -1365,7 +1571,6 @@ int sqlite3Fts3SegReaderNew( int nRoot, /* Size of buffer containing root node */ Fts3SegReader **ppReader /* OUT: Allocated Fts3SegReader */ ){ - int rc = SQLITE_OK; /* Return code */ Fts3SegReader *pReader; /* Newly allocated SegReader object */ int nExtra = 0; /* Bytes to allocate segment root node */ @@ -1380,6 +1585,7 @@ int sqlite3Fts3SegReaderNew( } memset(pReader, 0, sizeof(Fts3SegReader)); pReader->iIdx = iAge; + pReader->bLookup = bLookup!=0; pReader->iStartBlock = iStartLeaf; pReader->iLeafEndBlock = iEndLeaf; pReader->iEndBlock = iEndBlock; @@ -1387,19 +1593,15 @@ int sqlite3Fts3SegReaderNew( if( nExtra ){ /* The entire segment is stored in the root node. */ pReader->aNode = (char *)&pReader[1]; + pReader->rootOnly = 1; pReader->nNode = nRoot; memcpy(pReader->aNode, zRoot, nRoot); memset(&pReader->aNode[nRoot], 0, FTS3_NODE_PADDING); }else{ pReader->iCurrentBlock = iStartLeaf-1; } - - if( rc==SQLITE_OK ){ - *ppReader = pReader; - }else{ - sqlite3Fts3SegReaderFree(pReader); - } - return rc; + *ppReader = pReader; + return SQLITE_OK; } /* @@ -1449,6 +1651,7 @@ int sqlite3Fts3SegReaderPending( Fts3SegReader **ppReader /* OUT: SegReader for pending-terms */ ){ Fts3SegReader *pReader = 0; /* Fts3SegReader object to return */ + Fts3HashElem *pE; /* Iterator variable */ Fts3HashElem **aElem = 0; /* Array of term hash entries to scan */ int nElem = 0; /* Size of array at aElem */ int rc = SQLITE_OK; /* Return Code */ @@ -1457,7 +1660,6 @@ int sqlite3Fts3SegReaderPending( pHash = &p->aIndex[iIndex].hPending; if( bPrefix ){ int nAlloc = 0; /* Size of allocated array at aElem */ - Fts3HashElem *pE = 0; /* Iterator variable */ for(pE=fts3HashFirst(pHash); pE; pE=fts3HashNext(pE)){ char *zKey = (char *)fts3HashKey(pE); @@ -1491,8 +1693,13 @@ int sqlite3Fts3SegReaderPending( }else{ /* The query is a simple term lookup that matches at most one term in - ** the index. All that is required is a straight hash-lookup. */ - Fts3HashElem *pE = fts3HashFindElem(pHash, zTerm, nTerm); + ** the index. All that is required is a straight hash-lookup. + ** + ** Because the stack address of pE may be accessed via the aElem pointer + ** below, the "Fts3HashElem *pE" must be declared so that it is valid + ** within this entire function, not just this "else{...}" block. + */ + pE = fts3HashFindElem(pHash, zTerm, nTerm); if( pE ){ aElem = &pE; nElem = 1; @@ -1672,12 +1879,33 @@ static int fts3WriteSegment( return rc; } +/* +** Find the largest relative level number in the table. If successful, set +** *pnMax to this value and return SQLITE_OK. Otherwise, if an error occurs, +** set *pnMax to zero and return an SQLite error code. +*/ +int sqlite3Fts3MaxLevel(Fts3Table *p, int *pnMax){ + int rc; + int mxLevel = 0; + sqlite3_stmt *pStmt = 0; + + rc = fts3SqlStmt(p, SQL_SELECT_MXLEVEL, &pStmt, 0); + if( rc==SQLITE_OK ){ + if( SQLITE_ROW==sqlite3_step(pStmt) ){ + mxLevel = sqlite3_column_int(pStmt, 0); + } + rc = sqlite3_reset(pStmt); + } + *pnMax = mxLevel; + return rc; +} + /* ** Insert a record into the %_segdir table. */ static int fts3WriteSegdir( Fts3Table *p, /* Virtual table handle */ - int iLevel, /* Value for "level" field */ + sqlite3_int64 iLevel, /* Value for "level" field (absolute level) */ int iIdx, /* Value for "idx" field */ sqlite3_int64 iStartBlock, /* Value for "start_block" field */ sqlite3_int64 iLeafEndBlock, /* Value for "leaves_end_block" field */ @@ -1688,7 +1916,7 @@ static int fts3WriteSegdir( sqlite3_stmt *pStmt; int rc = fts3SqlStmt(p, SQL_INSERT_SEGDIR, &pStmt, 0); if( rc==SQLITE_OK ){ - sqlite3_bind_int(pStmt, 1, iLevel); + sqlite3_bind_int64(pStmt, 1, iLevel); sqlite3_bind_int(pStmt, 2, iIdx); sqlite3_bind_int64(pStmt, 3, iStartBlock); sqlite3_bind_int64(pStmt, 4, iLeafEndBlock); @@ -1988,6 +2216,7 @@ static int fts3SegWriterAdd( /* The current leaf node is full. Write it out to the database. */ rc = fts3WriteSegment(p, pWriter->iFree++, pWriter->aData, nData); if( rc!=SQLITE_OK ) return rc; + p->nLeafAdd++; /* Add the current term to the interior node tree. The term added to ** the interior tree must: @@ -2071,7 +2300,7 @@ static int fts3SegWriterAdd( static int fts3SegWriterFlush( Fts3Table *p, /* Virtual table handle */ SegmentWriter *pWriter, /* SegmentWriter to flush to the db */ - int iLevel, /* Value for 'level' column of %_segdir */ + sqlite3_int64 iLevel, /* Value for 'level' column of %_segdir */ int iIdx /* Value for 'idx' column of %_segdir */ ){ int rc; /* Return code */ @@ -2096,6 +2325,7 @@ static int fts3SegWriterFlush( rc = fts3WriteSegdir( p, iLevel, iIdx, 0, 0, 0, pWriter->aData, pWriter->nData); } + p->nLeafAdd++; return rc; } @@ -2125,12 +2355,18 @@ static void fts3SegWriterFree(SegmentWriter *pWriter){ static int fts3IsEmpty(Fts3Table *p, sqlite3_value *pRowid, int *pisEmpty){ sqlite3_stmt *pStmt; int rc; - rc = fts3SqlStmt(p, SQL_IS_EMPTY, &pStmt, &pRowid); - if( rc==SQLITE_OK ){ - if( SQLITE_ROW==sqlite3_step(pStmt) ){ - *pisEmpty = sqlite3_column_int(pStmt, 0); + if( p->zContentTbl ){ + /* If using the content=xxx option, assume the table is never empty */ + *pisEmpty = 0; + rc = SQLITE_OK; + }else{ + rc = fts3SqlStmt(p, SQL_IS_EMPTY, &pStmt, &pRowid); + if( rc==SQLITE_OK ){ + if( SQLITE_ROW==sqlite3_step(pStmt) ){ + *pisEmpty = sqlite3_column_int(pStmt, 0); + } + rc = sqlite3_reset(pStmt); } - rc = sqlite3_reset(pStmt); } return rc; } @@ -2143,7 +2379,12 @@ static int fts3IsEmpty(Fts3Table *p, sqlite3_value *pRowid, int *pisEmpty){ ** ** Return SQLITE_OK if successful, or an SQLite error code if not. */ -static int fts3SegmentMaxLevel(Fts3Table *p, int iIndex, int *pnMax){ +static int fts3SegmentMaxLevel( + Fts3Table *p, + int iLangid, + int iIndex, + sqlite3_int64 *pnMax +){ sqlite3_stmt *pStmt; int rc; assert( iIndex>=0 && iIndex<p->nIndex ); @@ -2156,15 +2397,40 @@ static int fts3SegmentMaxLevel(Fts3Table *p, int iIndex, int *pnMax){ */ rc = fts3SqlStmt(p, SQL_SELECT_SEGDIR_MAX_LEVEL, &pStmt, 0); if( rc!=SQLITE_OK ) return rc; - sqlite3_bind_int(pStmt, 1, iIndex*FTS3_SEGDIR_MAXLEVEL); - sqlite3_bind_int(pStmt, 2, (iIndex+1)*FTS3_SEGDIR_MAXLEVEL - 1); + sqlite3_bind_int64(pStmt, 1, getAbsoluteLevel(p, iLangid, iIndex, 0)); + sqlite3_bind_int64(pStmt, 2, + getAbsoluteLevel(p, iLangid, iIndex, FTS3_SEGDIR_MAXLEVEL-1) + ); if( SQLITE_ROW==sqlite3_step(pStmt) ){ - *pnMax = sqlite3_column_int(pStmt, 0); + *pnMax = sqlite3_column_int64(pStmt, 0); } return sqlite3_reset(pStmt); } /* +** Delete all entries in the %_segments table associated with the segment +** opened with seg-reader pSeg. This function does not affect the contents +** of the %_segdir table. +*/ +static int fts3DeleteSegment( + Fts3Table *p, /* FTS table handle */ + Fts3SegReader *pSeg /* Segment to delete */ +){ + int rc = SQLITE_OK; /* Return code */ + if( pSeg->iStartBlock ){ + sqlite3_stmt *pDelete; /* SQL statement to delete rows */ + rc = fts3SqlStmt(p, SQL_DELETE_SEGMENTS_RANGE, &pDelete, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int64(pDelete, 1, pSeg->iStartBlock); + sqlite3_bind_int64(pDelete, 2, pSeg->iEndBlock); + sqlite3_step(pDelete); + rc = sqlite3_reset(pDelete); + } + } + return rc; +} + +/* ** This function is used after merging multiple segments into a single large ** segment to delete the old, now redundant, segment b-trees. Specifically, ** it: @@ -2180,24 +2446,18 @@ static int fts3SegmentMaxLevel(Fts3Table *p, int iIndex, int *pnMax){ */ static int fts3DeleteSegdir( Fts3Table *p, /* Virtual table handle */ + int iLangid, /* Language id */ int iIndex, /* Index for p->aIndex */ int iLevel, /* Level of %_segdir entries to delete */ Fts3SegReader **apSegment, /* Array of SegReader objects */ int nReader /* Size of array apSegment */ ){ - int rc; /* Return Code */ + int rc = SQLITE_OK; /* Return Code */ int i; /* Iterator variable */ - sqlite3_stmt *pDelete; /* SQL statement to delete rows */ + sqlite3_stmt *pDelete = 0; /* SQL statement to delete rows */ - rc = fts3SqlStmt(p, SQL_DELETE_SEGMENTS_RANGE, &pDelete, 0); for(i=0; rc==SQLITE_OK && i<nReader; i++){ - Fts3SegReader *pSegment = apSegment[i]; - if( pSegment->iStartBlock ){ - sqlite3_bind_int64(pDelete, 1, pSegment->iStartBlock); - sqlite3_bind_int64(pDelete, 2, pSegment->iEndBlock); - sqlite3_step(pDelete); - rc = sqlite3_reset(pDelete); - } + rc = fts3DeleteSegment(p, apSegment[i]); } if( rc!=SQLITE_OK ){ return rc; @@ -2207,13 +2467,17 @@ static int fts3DeleteSegdir( if( iLevel==FTS3_SEGCURSOR_ALL ){ rc = fts3SqlStmt(p, SQL_DELETE_SEGDIR_RANGE, &pDelete, 0); if( rc==SQLITE_OK ){ - sqlite3_bind_int(pDelete, 1, iIndex*FTS3_SEGDIR_MAXLEVEL); - sqlite3_bind_int(pDelete, 2, (iIndex+1) * FTS3_SEGDIR_MAXLEVEL - 1); + sqlite3_bind_int64(pDelete, 1, getAbsoluteLevel(p, iLangid, iIndex, 0)); + sqlite3_bind_int64(pDelete, 2, + getAbsoluteLevel(p, iLangid, iIndex, FTS3_SEGDIR_MAXLEVEL-1) + ); } }else{ rc = fts3SqlStmt(p, SQL_DELETE_SEGDIR_LEVEL, &pDelete, 0); if( rc==SQLITE_OK ){ - sqlite3_bind_int(pDelete, 1, iIndex*FTS3_SEGDIR_MAXLEVEL + iLevel); + sqlite3_bind_int64( + pDelete, 1, getAbsoluteLevel(p, iLangid, iIndex, iLevel) + ); } } @@ -2376,11 +2640,16 @@ static int fts3SegReaderStart( ** b-tree leaf nodes contain more than one term. */ for(i=0; pCsr->bRestart==0 && i<pCsr->nSegment; i++){ + int res = 0; Fts3SegReader *pSeg = pCsr->apSegment[i]; do { int rc = fts3SegReaderNext(p, pSeg, 0); if( rc!=SQLITE_OK ) return rc; - }while( zTerm && fts3SegReaderTermCmp(pSeg, zTerm, nTerm)<0 ); + }while( zTerm && (res = fts3SegReaderTermCmp(pSeg, zTerm, nTerm))<0 ); + + if( pSeg->bLookup && res!=0 ){ + fts3SegReaderSetEof(pSeg); + } } fts3SegReaderSort(pCsr->apSegment, nSeg, nSeg, fts3SegReaderCmp); @@ -2482,6 +2751,7 @@ int sqlite3Fts3SegReaderStep( int isColFilter = (pCsr->pFilter->flags & FTS3_SEGMENT_COLUMN_FILTER); int isPrefix = (pCsr->pFilter->flags & FTS3_SEGMENT_PREFIX); int isScan = (pCsr->pFilter->flags & FTS3_SEGMENT_SCAN); + int isFirst = (pCsr->pFilter->flags & FTS3_SEGMENT_FIRST); Fts3SegReader **apSegment = pCsr->apSegment; int nSegment = pCsr->nSegment; @@ -2500,7 +2770,12 @@ int sqlite3Fts3SegReaderStep( ** forward. Then sort the list in order of current term again. */ for(i=0; i<pCsr->nAdvance; i++){ - rc = fts3SegReaderNext(p, apSegment[i], 0); + Fts3SegReader *pSeg = apSegment[i]; + if( pSeg->bLookup ){ + fts3SegReaderSetEof(pSeg); + }else{ + rc = fts3SegReaderNext(p, pSeg, 0); + } if( rc!=SQLITE_OK ) return rc; } fts3SegReaderSort(apSegment, nSegment, pCsr->nAdvance, fts3SegReaderCmp); @@ -2541,6 +2816,7 @@ int sqlite3Fts3SegReaderStep( assert( isIgnoreEmpty || (isRequirePos && !isColFilter) ); if( nMerge==1 && !isIgnoreEmpty + && !isFirst && (p->bDescIdx==0 || fts3SegReaderIsPending(apSegment[0])==0) ){ pCsr->nDoclist = apSegment[0]->nDoclist; @@ -2606,12 +2882,24 @@ int sqlite3Fts3SegReaderStep( } pCsr->aBuffer = aNew; } - nDoclist += sqlite3Fts3PutVarint(&pCsr->aBuffer[nDoclist], iDelta); - iPrev = iDocid; - if( isRequirePos ){ - memcpy(&pCsr->aBuffer[nDoclist], pList, nList); - nDoclist += nList; - pCsr->aBuffer[nDoclist++] = '\0'; + + if( isFirst ){ + char *a = &pCsr->aBuffer[nDoclist]; + int nWrite; + + nWrite = sqlite3Fts3FirstFilter(iDelta, pList, nList, a); + if( nWrite ){ + iPrev = iDocid; + nDoclist += nWrite; + } + }else{ + nDoclist += sqlite3Fts3PutVarint(&pCsr->aBuffer[nDoclist], iDelta); + iPrev = iDocid; + if( isRequirePos ){ + memcpy(&pCsr->aBuffer[nDoclist], pList, nList); + nDoclist += nList; + pCsr->aBuffer[nDoclist++] = '\0'; + } } } @@ -2658,13 +2946,18 @@ void sqlite3Fts3SegReaderFinish( ** Otherwise, if successful, SQLITE_OK is returned. If an error occurs, ** an SQLite error code is returned. */ -static int fts3SegmentMerge(Fts3Table *p, int iIndex, int iLevel){ +static int fts3SegmentMerge( + Fts3Table *p, + int iLangid, /* Language id to merge */ + int iIndex, /* Index in p->aIndex[] to merge */ + int iLevel /* Level to merge */ +){ int rc; /* Return code */ int iIdx = 0; /* Index of new segment */ - int iNewLevel = 0; /* Level/index to create new segment at */ + sqlite3_int64 iNewLevel = 0; /* Level/index to create new segment at */ SegmentWriter *pWriter = 0; /* Used to write the new, merged, segment */ Fts3SegFilter filter; /* Segment term filter condition */ - Fts3MultiSegReader csr; /* Cursor to iterate through level(s) */ + Fts3MultiSegReader csr; /* Cursor to iterate through level(s) */ int bIgnoreEmpty = 0; /* True to ignore empty segments */ assert( iLevel==FTS3_SEGCURSOR_ALL @@ -2674,36 +2967,36 @@ static int fts3SegmentMerge(Fts3Table *p, int iIndex, int iLevel){ assert( iLevel<FTS3_SEGDIR_MAXLEVEL ); assert( iIndex>=0 && iIndex<p->nIndex ); - rc = sqlite3Fts3SegReaderCursor(p, iIndex, iLevel, 0, 0, 1, 0, &csr); + rc = sqlite3Fts3SegReaderCursor(p, iLangid, iIndex, iLevel, 0, 0, 1, 0, &csr); if( rc!=SQLITE_OK || csr.nSegment==0 ) goto finished; if( iLevel==FTS3_SEGCURSOR_ALL ){ /* This call is to merge all segments in the database to a single - ** segment. The level of the new segment is equal to the the numerically + ** segment. The level of the new segment is equal to the numerically ** greatest segment level currently present in the database for this ** index. The idx of the new segment is always 0. */ if( csr.nSegment==1 ){ rc = SQLITE_DONE; goto finished; } - rc = fts3SegmentMaxLevel(p, iIndex, &iNewLevel); + rc = fts3SegmentMaxLevel(p, iLangid, iIndex, &iNewLevel); bIgnoreEmpty = 1; }else if( iLevel==FTS3_SEGCURSOR_PENDING ){ - iNewLevel = iIndex * FTS3_SEGDIR_MAXLEVEL; - rc = fts3AllocateSegdirIdx(p, iIndex, 0, &iIdx); + iNewLevel = getAbsoluteLevel(p, iLangid, iIndex, 0); + rc = fts3AllocateSegdirIdx(p, iLangid, iIndex, 0, &iIdx); }else{ /* This call is to merge all segments at level iLevel. find the next ** available segment index at level iLevel+1. The call to ** fts3AllocateSegdirIdx() will merge the segments at level iLevel+1 to ** a single iLevel+2 segment if necessary. */ - rc = fts3AllocateSegdirIdx(p, iIndex, iLevel+1, &iIdx); - iNewLevel = iIndex * FTS3_SEGDIR_MAXLEVEL + iLevel+1; + rc = fts3AllocateSegdirIdx(p, iLangid, iIndex, iLevel+1, &iIdx); + iNewLevel = getAbsoluteLevel(p, iLangid, iIndex, iLevel+1); } if( rc!=SQLITE_OK ) goto finished; assert( csr.nSegment>0 ); - assert( iNewLevel>=(iIndex*FTS3_SEGDIR_MAXLEVEL) ); - assert( iNewLevel<((iIndex+1)*FTS3_SEGDIR_MAXLEVEL) ); + assert( iNewLevel>=getAbsoluteLevel(p, iLangid, iIndex, 0) ); + assert( iNewLevel<getAbsoluteLevel(p, iLangid, iIndex,FTS3_SEGDIR_MAXLEVEL) ); memset(&filter, 0, sizeof(Fts3SegFilter)); filter.flags = FTS3_SEGMENT_REQUIRE_POS; @@ -2720,7 +3013,9 @@ static int fts3SegmentMerge(Fts3Table *p, int iIndex, int iLevel){ assert( pWriter ); if( iLevel!=FTS3_SEGCURSOR_PENDING ){ - rc = fts3DeleteSegdir(p, iIndex, iLevel, csr.apSegment, csr.nSegment); + rc = fts3DeleteSegdir( + p, iLangid, iIndex, iLevel, csr.apSegment, csr.nSegment + ); if( rc!=SQLITE_OK ) goto finished; } rc = fts3SegWriterFlush(p, pWriter, iNewLevel, iIdx); @@ -2738,11 +3033,28 @@ static int fts3SegmentMerge(Fts3Table *p, int iIndex, int iLevel){ int sqlite3Fts3PendingTermsFlush(Fts3Table *p){ int rc = SQLITE_OK; int i; + for(i=0; rc==SQLITE_OK && i<p->nIndex; i++){ - rc = fts3SegmentMerge(p, i, FTS3_SEGCURSOR_PENDING); + rc = fts3SegmentMerge(p, p->iPrevLangid, i, FTS3_SEGCURSOR_PENDING); if( rc==SQLITE_DONE ) rc = SQLITE_OK; } sqlite3Fts3PendingTermsClear(p); + + /* Determine the auto-incr-merge setting if unknown. If enabled, + ** estimate the number of leaf blocks of content to be written + */ + if( rc==SQLITE_OK && p->bHasStat + && p->bAutoincrmerge==0xff && p->nLeafAdd>0 + ){ + sqlite3_stmt *pStmt = 0; + rc = fts3SqlStmt(p, SQL_SELECT_STAT, &pStmt, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int(pStmt, 1, FTS_STAT_AUTOINCRMERGE); + rc = sqlite3_step(pStmt); + p->bAutoincrmerge = (rc==SQLITE_ROW && sqlite3_column_int(pStmt, 0)); + rc = sqlite3_reset(pStmt); + } + } return rc; } @@ -2787,9 +3099,9 @@ static void fts3DecodeIntArray( ** a blob of varints. */ static void fts3InsertDocsize( - int *pRC, /* Result code */ - Fts3Table *p, /* Table into which to insert */ - u32 *aSz /* Sizes of each column */ + int *pRC, /* Result code */ + Fts3Table *p, /* Table into which to insert */ + u32 *aSz /* Sizes of each column, in tokens */ ){ char *pBlob; /* The BLOB encoding of the document size */ int nBlob; /* Number of bytes in the BLOB */ @@ -2853,12 +3165,13 @@ static void fts3UpdateDocTotals( return; } pBlob = (char*)&a[nStat]; - rc = fts3SqlStmt(p, SQL_SELECT_DOCTOTAL, &pStmt, 0); + rc = fts3SqlStmt(p, SQL_SELECT_STAT, &pStmt, 0); if( rc ){ sqlite3_free(a); *pRC = rc; return; } + sqlite3_bind_int(pStmt, 1, FTS_STAT_DOCTOTAL); if( sqlite3_step(pStmt)==SQLITE_ROW ){ fts3DecodeIntArray(nStat, a, sqlite3_column_blob(pStmt, 0), @@ -2866,7 +3179,12 @@ static void fts3UpdateDocTotals( }else{ memset(a, 0, sizeof(u32)*(nStat) ); } - sqlite3_reset(pStmt); + rc = sqlite3_reset(pStmt); + if( rc!=SQLITE_OK ){ + sqlite3_free(a); + *pRC = rc; + return; + } if( nChng<0 && a[0]<(u32)(-nChng) ){ a[0] = 0; }else{ @@ -2882,29 +3200,47 @@ static void fts3UpdateDocTotals( a[i+1] = x; } fts3EncodeIntArray(nStat, a, pBlob, &nBlob); - rc = fts3SqlStmt(p, SQL_REPLACE_DOCTOTAL, &pStmt, 0); + rc = fts3SqlStmt(p, SQL_REPLACE_STAT, &pStmt, 0); if( rc ){ sqlite3_free(a); *pRC = rc; return; } - sqlite3_bind_blob(pStmt, 1, pBlob, nBlob, SQLITE_STATIC); + sqlite3_bind_int(pStmt, 1, FTS_STAT_DOCTOTAL); + sqlite3_bind_blob(pStmt, 2, pBlob, nBlob, SQLITE_STATIC); sqlite3_step(pStmt); *pRC = sqlite3_reset(pStmt); sqlite3_free(a); } +/* +** Merge the entire database so that there is one segment for each +** iIndex/iLangid combination. +*/ static int fts3DoOptimize(Fts3Table *p, int bReturnDone){ - int i; int bSeenDone = 0; - int rc = SQLITE_OK; - for(i=0; rc==SQLITE_OK && i<p->nIndex; i++){ - rc = fts3SegmentMerge(p, i, FTS3_SEGCURSOR_ALL); - if( rc==SQLITE_DONE ){ - bSeenDone = 1; - rc = SQLITE_OK; + int rc; + sqlite3_stmt *pAllLangid = 0; + + rc = fts3SqlStmt(p, SQL_SELECT_ALL_LANGID, &pAllLangid, 0); + if( rc==SQLITE_OK ){ + int rc2; + sqlite3_bind_int(pAllLangid, 1, p->nIndex); + while( sqlite3_step(pAllLangid)==SQLITE_ROW ){ + int i; + int iLangid = sqlite3_column_int(pAllLangid, 0); + for(i=0; rc==SQLITE_OK && i<p->nIndex; i++){ + rc = fts3SegmentMerge(p, iLangid, i, FTS3_SEGCURSOR_ALL); + if( rc==SQLITE_DONE ){ + bSeenDone = 1; + rc = SQLITE_OK; + } + } } + rc2 = sqlite3_reset(pAllLangid); + if( rc==SQLITE_OK ) rc = rc2; } + sqlite3Fts3SegmentsClose(p); sqlite3Fts3PendingTermsClear(p); @@ -2912,6 +3248,1768 @@ static int fts3DoOptimize(Fts3Table *p, int bReturnDone){ } /* +** This function is called when the user executes the following statement: +** +** INSERT INTO <tbl>(<tbl>) VALUES('rebuild'); +** +** The entire FTS index is discarded and rebuilt. If the table is one +** created using the content=xxx option, then the new index is based on +** the current contents of the xxx table. Otherwise, it is rebuilt based +** on the contents of the %_content table. +*/ +static int fts3DoRebuild(Fts3Table *p){ + int rc; /* Return Code */ + + rc = fts3DeleteAll(p, 0); + if( rc==SQLITE_OK ){ + u32 *aSz = 0; + u32 *aSzIns = 0; + u32 *aSzDel = 0; + sqlite3_stmt *pStmt = 0; + int nEntry = 0; + + /* Compose and prepare an SQL statement to loop through the content table */ + char *zSql = sqlite3_mprintf("SELECT %s" , p->zReadExprlist); + if( !zSql ){ + rc = SQLITE_NOMEM; + }else{ + rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0); + sqlite3_free(zSql); + } + + if( rc==SQLITE_OK ){ + int nByte = sizeof(u32) * (p->nColumn+1)*3; + aSz = (u32 *)sqlite3_malloc(nByte); + if( aSz==0 ){ + rc = SQLITE_NOMEM; + }else{ + memset(aSz, 0, nByte); + aSzIns = &aSz[p->nColumn+1]; + aSzDel = &aSzIns[p->nColumn+1]; + } + } + + while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pStmt) ){ + int iCol; + int iLangid = langidFromSelect(p, pStmt); + rc = fts3PendingTermsDocid(p, iLangid, sqlite3_column_int64(pStmt, 0)); + memset(aSz, 0, sizeof(aSz[0]) * (p->nColumn+1)); + for(iCol=0; rc==SQLITE_OK && iCol<p->nColumn; iCol++){ + const char *z = (const char *) sqlite3_column_text(pStmt, iCol+1); + rc = fts3PendingTermsAdd(p, iLangid, z, iCol, &aSz[iCol]); + aSz[p->nColumn] += sqlite3_column_bytes(pStmt, iCol+1); + } + if( p->bHasDocsize ){ + fts3InsertDocsize(&rc, p, aSz); + } + if( rc!=SQLITE_OK ){ + sqlite3_finalize(pStmt); + pStmt = 0; + }else{ + nEntry++; + for(iCol=0; iCol<=p->nColumn; iCol++){ + aSzIns[iCol] += aSz[iCol]; + } + } + } + if( p->bFts4 ){ + fts3UpdateDocTotals(&rc, p, aSzIns, aSzDel, nEntry); + } + sqlite3_free(aSz); + + if( pStmt ){ + int rc2 = sqlite3_finalize(pStmt); + if( rc==SQLITE_OK ){ + rc = rc2; + } + } + } + + return rc; +} + + +/* +** This function opens a cursor used to read the input data for an +** incremental merge operation. Specifically, it opens a cursor to scan +** the oldest nSeg segments (idx=0 through idx=(nSeg-1)) in absolute +** level iAbsLevel. +*/ +static int fts3IncrmergeCsr( + Fts3Table *p, /* FTS3 table handle */ + sqlite3_int64 iAbsLevel, /* Absolute level to open */ + int nSeg, /* Number of segments to merge */ + Fts3MultiSegReader *pCsr /* Cursor object to populate */ +){ + int rc; /* Return Code */ + sqlite3_stmt *pStmt = 0; /* Statement used to read %_segdir entry */ + int nByte; /* Bytes allocated at pCsr->apSegment[] */ + + /* Allocate space for the Fts3MultiSegReader.aCsr[] array */ + memset(pCsr, 0, sizeof(*pCsr)); + nByte = sizeof(Fts3SegReader *) * nSeg; + pCsr->apSegment = (Fts3SegReader **)sqlite3_malloc(nByte); + + if( pCsr->apSegment==0 ){ + rc = SQLITE_NOMEM; + }else{ + memset(pCsr->apSegment, 0, nByte); + rc = fts3SqlStmt(p, SQL_SELECT_LEVEL, &pStmt, 0); + } + if( rc==SQLITE_OK ){ + int i; + int rc2; + sqlite3_bind_int64(pStmt, 1, iAbsLevel); + assert( pCsr->nSegment==0 ); + for(i=0; rc==SQLITE_OK && sqlite3_step(pStmt)==SQLITE_ROW && i<nSeg; i++){ + rc = sqlite3Fts3SegReaderNew(i, 0, + sqlite3_column_int64(pStmt, 1), /* segdir.start_block */ + sqlite3_column_int64(pStmt, 2), /* segdir.leaves_end_block */ + sqlite3_column_int64(pStmt, 3), /* segdir.end_block */ + sqlite3_column_blob(pStmt, 4), /* segdir.root */ + sqlite3_column_bytes(pStmt, 4), /* segdir.root */ + &pCsr->apSegment[i] + ); + pCsr->nSegment++; + } + rc2 = sqlite3_reset(pStmt); + if( rc==SQLITE_OK ) rc = rc2; + } + + return rc; +} + +typedef struct IncrmergeWriter IncrmergeWriter; +typedef struct NodeWriter NodeWriter; +typedef struct Blob Blob; +typedef struct NodeReader NodeReader; + +/* +** An instance of the following structure is used as a dynamic buffer +** to build up nodes or other blobs of data in. +** +** The function blobGrowBuffer() is used to extend the allocation. +*/ +struct Blob { + char *a; /* Pointer to allocation */ + int n; /* Number of valid bytes of data in a[] */ + int nAlloc; /* Allocated size of a[] (nAlloc>=n) */ +}; + +/* +** This structure is used to build up buffers containing segment b-tree +** nodes (blocks). +*/ +struct NodeWriter { + sqlite3_int64 iBlock; /* Current block id */ + Blob key; /* Last key written to the current block */ + Blob block; /* Current block image */ +}; + +/* +** An object of this type contains the state required to create or append +** to an appendable b-tree segment. +*/ +struct IncrmergeWriter { + int nLeafEst; /* Space allocated for leaf blocks */ + int nWork; /* Number of leaf pages flushed */ + sqlite3_int64 iAbsLevel; /* Absolute level of input segments */ + int iIdx; /* Index of *output* segment in iAbsLevel+1 */ + sqlite3_int64 iStart; /* Block number of first allocated block */ + sqlite3_int64 iEnd; /* Block number of last allocated block */ + NodeWriter aNodeWriter[FTS_MAX_APPENDABLE_HEIGHT]; +}; + +/* +** An object of the following type is used to read data from a single +** FTS segment node. See the following functions: +** +** nodeReaderInit() +** nodeReaderNext() +** nodeReaderRelease() +*/ +struct NodeReader { + const char *aNode; + int nNode; + int iOff; /* Current offset within aNode[] */ + + /* Output variables. Containing the current node entry. */ + sqlite3_int64 iChild; /* Pointer to child node */ + Blob term; /* Current term */ + const char *aDoclist; /* Pointer to doclist */ + int nDoclist; /* Size of doclist in bytes */ +}; + +/* +** If *pRc is not SQLITE_OK when this function is called, it is a no-op. +** Otherwise, if the allocation at pBlob->a is not already at least nMin +** bytes in size, extend (realloc) it to be so. +** +** If an OOM error occurs, set *pRc to SQLITE_NOMEM and leave pBlob->a +** unmodified. Otherwise, if the allocation succeeds, update pBlob->nAlloc +** to reflect the new size of the pBlob->a[] buffer. +*/ +static void blobGrowBuffer(Blob *pBlob, int nMin, int *pRc){ + if( *pRc==SQLITE_OK && nMin>pBlob->nAlloc ){ + int nAlloc = nMin; + char *a = (char *)sqlite3_realloc(pBlob->a, nAlloc); + if( a ){ + pBlob->nAlloc = nAlloc; + pBlob->a = a; + }else{ + *pRc = SQLITE_NOMEM; + } + } +} + +/* +** Attempt to advance the node-reader object passed as the first argument to +** the next entry on the node. +** +** Return an error code if an error occurs (SQLITE_NOMEM is possible). +** Otherwise return SQLITE_OK. If there is no next entry on the node +** (e.g. because the current entry is the last) set NodeReader->aNode to +** NULL to indicate EOF. Otherwise, populate the NodeReader structure output +** variables for the new entry. +*/ +static int nodeReaderNext(NodeReader *p){ + int bFirst = (p->term.n==0); /* True for first term on the node */ + int nPrefix = 0; /* Bytes to copy from previous term */ + int nSuffix = 0; /* Bytes to append to the prefix */ + int rc = SQLITE_OK; /* Return code */ + + assert( p->aNode ); + if( p->iChild && bFirst==0 ) p->iChild++; + if( p->iOff>=p->nNode ){ + /* EOF */ + p->aNode = 0; + }else{ + if( bFirst==0 ){ + p->iOff += sqlite3Fts3GetVarint32(&p->aNode[p->iOff], &nPrefix); + } + p->iOff += sqlite3Fts3GetVarint32(&p->aNode[p->iOff], &nSuffix); + + blobGrowBuffer(&p->term, nPrefix+nSuffix, &rc); + if( rc==SQLITE_OK ){ + memcpy(&p->term.a[nPrefix], &p->aNode[p->iOff], nSuffix); + p->term.n = nPrefix+nSuffix; + p->iOff += nSuffix; + if( p->iChild==0 ){ + p->iOff += sqlite3Fts3GetVarint32(&p->aNode[p->iOff], &p->nDoclist); + p->aDoclist = &p->aNode[p->iOff]; + p->iOff += p->nDoclist; + } + } + } + + assert( p->iOff<=p->nNode ); + + return rc; +} + +/* +** Release all dynamic resources held by node-reader object *p. +*/ +static void nodeReaderRelease(NodeReader *p){ + sqlite3_free(p->term.a); +} + +/* +** Initialize a node-reader object to read the node in buffer aNode/nNode. +** +** If successful, SQLITE_OK is returned and the NodeReader object set to +** point to the first entry on the node (if any). Otherwise, an SQLite +** error code is returned. +*/ +static int nodeReaderInit(NodeReader *p, const char *aNode, int nNode){ + memset(p, 0, sizeof(NodeReader)); + p->aNode = aNode; + p->nNode = nNode; + + /* Figure out if this is a leaf or an internal node. */ + if( p->aNode[0] ){ + /* An internal node. */ + p->iOff = 1 + sqlite3Fts3GetVarint(&p->aNode[1], &p->iChild); + }else{ + p->iOff = 1; + } + + return nodeReaderNext(p); +} + +/* +** This function is called while writing an FTS segment each time a leaf o +** node is finished and written to disk. The key (zTerm/nTerm) is guaranteed +** to be greater than the largest key on the node just written, but smaller +** than or equal to the first key that will be written to the next leaf +** node. +** +** The block id of the leaf node just written to disk may be found in +** (pWriter->aNodeWriter[0].iBlock) when this function is called. +*/ +static int fts3IncrmergePush( + Fts3Table *p, /* Fts3 table handle */ + IncrmergeWriter *pWriter, /* Writer object */ + const char *zTerm, /* Term to write to internal node */ + int nTerm /* Bytes at zTerm */ +){ + sqlite3_int64 iPtr = pWriter->aNodeWriter[0].iBlock; + int iLayer; + + assert( nTerm>0 ); + for(iLayer=1; ALWAYS(iLayer<FTS_MAX_APPENDABLE_HEIGHT); iLayer++){ + sqlite3_int64 iNextPtr = 0; + NodeWriter *pNode = &pWriter->aNodeWriter[iLayer]; + int rc = SQLITE_OK; + int nPrefix; + int nSuffix; + int nSpace; + + /* Figure out how much space the key will consume if it is written to + ** the current node of layer iLayer. Due to the prefix compression, + ** the space required changes depending on which node the key is to + ** be added to. */ + nPrefix = fts3PrefixCompress(pNode->key.a, pNode->key.n, zTerm, nTerm); + nSuffix = nTerm - nPrefix; + nSpace = sqlite3Fts3VarintLen(nPrefix); + nSpace += sqlite3Fts3VarintLen(nSuffix) + nSuffix; + + if( pNode->key.n==0 || (pNode->block.n + nSpace)<=p->nNodeSize ){ + /* If the current node of layer iLayer contains zero keys, or if adding + ** the key to it will not cause it to grow to larger than nNodeSize + ** bytes in size, write the key here. */ + + Blob *pBlk = &pNode->block; + if( pBlk->n==0 ){ + blobGrowBuffer(pBlk, p->nNodeSize, &rc); + if( rc==SQLITE_OK ){ + pBlk->a[0] = (char)iLayer; + pBlk->n = 1 + sqlite3Fts3PutVarint(&pBlk->a[1], iPtr); + } + } + blobGrowBuffer(pBlk, pBlk->n + nSpace, &rc); + blobGrowBuffer(&pNode->key, nTerm, &rc); + + if( rc==SQLITE_OK ){ + if( pNode->key.n ){ + pBlk->n += sqlite3Fts3PutVarint(&pBlk->a[pBlk->n], nPrefix); + } + pBlk->n += sqlite3Fts3PutVarint(&pBlk->a[pBlk->n], nSuffix); + memcpy(&pBlk->a[pBlk->n], &zTerm[nPrefix], nSuffix); + pBlk->n += nSuffix; + + memcpy(pNode->key.a, zTerm, nTerm); + pNode->key.n = nTerm; + } + }else{ + /* Otherwise, flush the current node of layer iLayer to disk. + ** Then allocate a new, empty sibling node. The key will be written + ** into the parent of this node. */ + rc = fts3WriteSegment(p, pNode->iBlock, pNode->block.a, pNode->block.n); + + assert( pNode->block.nAlloc>=p->nNodeSize ); + pNode->block.a[0] = (char)iLayer; + pNode->block.n = 1 + sqlite3Fts3PutVarint(&pNode->block.a[1], iPtr+1); + + iNextPtr = pNode->iBlock; + pNode->iBlock++; + pNode->key.n = 0; + } + + if( rc!=SQLITE_OK || iNextPtr==0 ) return rc; + iPtr = iNextPtr; + } + + assert( 0 ); + return 0; +} + +/* +** Append a term and (optionally) doclist to the FTS segment node currently +** stored in blob *pNode. The node need not contain any terms, but the +** header must be written before this function is called. +** +** A node header is a single 0x00 byte for a leaf node, or a height varint +** followed by the left-hand-child varint for an internal node. +** +** The term to be appended is passed via arguments zTerm/nTerm. For a +** leaf node, the doclist is passed as aDoclist/nDoclist. For an internal +** node, both aDoclist and nDoclist must be passed 0. +** +** If the size of the value in blob pPrev is zero, then this is the first +** term written to the node. Otherwise, pPrev contains a copy of the +** previous term. Before this function returns, it is updated to contain a +** copy of zTerm/nTerm. +** +** It is assumed that the buffer associated with pNode is already large +** enough to accommodate the new entry. The buffer associated with pPrev +** is extended by this function if requrired. +** +** If an error (i.e. OOM condition) occurs, an SQLite error code is +** returned. Otherwise, SQLITE_OK. +*/ +static int fts3AppendToNode( + Blob *pNode, /* Current node image to append to */ + Blob *pPrev, /* Buffer containing previous term written */ + const char *zTerm, /* New term to write */ + int nTerm, /* Size of zTerm in bytes */ + const char *aDoclist, /* Doclist (or NULL) to write */ + int nDoclist /* Size of aDoclist in bytes */ +){ + int rc = SQLITE_OK; /* Return code */ + int bFirst = (pPrev->n==0); /* True if this is the first term written */ + int nPrefix; /* Size of term prefix in bytes */ + int nSuffix; /* Size of term suffix in bytes */ + + /* Node must have already been started. There must be a doclist for a + ** leaf node, and there must not be a doclist for an internal node. */ + assert( pNode->n>0 ); + assert( (pNode->a[0]=='\0')==(aDoclist!=0) ); + + blobGrowBuffer(pPrev, nTerm, &rc); + if( rc!=SQLITE_OK ) return rc; + + nPrefix = fts3PrefixCompress(pPrev->a, pPrev->n, zTerm, nTerm); + nSuffix = nTerm - nPrefix; + memcpy(pPrev->a, zTerm, nTerm); + pPrev->n = nTerm; + + if( bFirst==0 ){ + pNode->n += sqlite3Fts3PutVarint(&pNode->a[pNode->n], nPrefix); + } + pNode->n += sqlite3Fts3PutVarint(&pNode->a[pNode->n], nSuffix); + memcpy(&pNode->a[pNode->n], &zTerm[nPrefix], nSuffix); + pNode->n += nSuffix; + + if( aDoclist ){ + pNode->n += sqlite3Fts3PutVarint(&pNode->a[pNode->n], nDoclist); + memcpy(&pNode->a[pNode->n], aDoclist, nDoclist); + pNode->n += nDoclist; + } + + assert( pNode->n<=pNode->nAlloc ); + + return SQLITE_OK; +} + +/* +** Append the current term and doclist pointed to by cursor pCsr to the +** appendable b-tree segment opened for writing by pWriter. +** +** Return SQLITE_OK if successful, or an SQLite error code otherwise. +*/ +static int fts3IncrmergeAppend( + Fts3Table *p, /* Fts3 table handle */ + IncrmergeWriter *pWriter, /* Writer object */ + Fts3MultiSegReader *pCsr /* Cursor containing term and doclist */ +){ + const char *zTerm = pCsr->zTerm; + int nTerm = pCsr->nTerm; + const char *aDoclist = pCsr->aDoclist; + int nDoclist = pCsr->nDoclist; + int rc = SQLITE_OK; /* Return code */ + int nSpace; /* Total space in bytes required on leaf */ + int nPrefix; /* Size of prefix shared with previous term */ + int nSuffix; /* Size of suffix (nTerm - nPrefix) */ + NodeWriter *pLeaf; /* Object used to write leaf nodes */ + + pLeaf = &pWriter->aNodeWriter[0]; + nPrefix = fts3PrefixCompress(pLeaf->key.a, pLeaf->key.n, zTerm, nTerm); + nSuffix = nTerm - nPrefix; + + nSpace = sqlite3Fts3VarintLen(nPrefix); + nSpace += sqlite3Fts3VarintLen(nSuffix) + nSuffix; + nSpace += sqlite3Fts3VarintLen(nDoclist) + nDoclist; + + /* If the current block is not empty, and if adding this term/doclist + ** to the current block would make it larger than Fts3Table.nNodeSize + ** bytes, write this block out to the database. */ + if( pLeaf->block.n>0 && (pLeaf->block.n + nSpace)>p->nNodeSize ){ + rc = fts3WriteSegment(p, pLeaf->iBlock, pLeaf->block.a, pLeaf->block.n); + pWriter->nWork++; + + /* Add the current term to the parent node. The term added to the + ** parent must: + ** + ** a) be greater than the largest term on the leaf node just written + ** to the database (still available in pLeaf->key), and + ** + ** b) be less than or equal to the term about to be added to the new + ** leaf node (zTerm/nTerm). + ** + ** In other words, it must be the prefix of zTerm 1 byte longer than + ** the common prefix (if any) of zTerm and pWriter->zTerm. + */ + if( rc==SQLITE_OK ){ + rc = fts3IncrmergePush(p, pWriter, zTerm, nPrefix+1); + } + + /* Advance to the next output block */ + pLeaf->iBlock++; + pLeaf->key.n = 0; + pLeaf->block.n = 0; + + nSuffix = nTerm; + nSpace = 1; + nSpace += sqlite3Fts3VarintLen(nSuffix) + nSuffix; + nSpace += sqlite3Fts3VarintLen(nDoclist) + nDoclist; + } + + blobGrowBuffer(&pLeaf->block, pLeaf->block.n + nSpace, &rc); + + if( rc==SQLITE_OK ){ + if( pLeaf->block.n==0 ){ + pLeaf->block.n = 1; + pLeaf->block.a[0] = '\0'; + } + rc = fts3AppendToNode( + &pLeaf->block, &pLeaf->key, zTerm, nTerm, aDoclist, nDoclist + ); + } + + return rc; +} + +/* +** This function is called to release all dynamic resources held by the +** merge-writer object pWriter, and if no error has occurred, to flush +** all outstanding node buffers held by pWriter to disk. +** +** If *pRc is not SQLITE_OK when this function is called, then no attempt +** is made to write any data to disk. Instead, this function serves only +** to release outstanding resources. +** +** Otherwise, if *pRc is initially SQLITE_OK and an error occurs while +** flushing buffers to disk, *pRc is set to an SQLite error code before +** returning. +*/ +static void fts3IncrmergeRelease( + Fts3Table *p, /* FTS3 table handle */ + IncrmergeWriter *pWriter, /* Merge-writer object */ + int *pRc /* IN/OUT: Error code */ +){ + int i; /* Used to iterate through non-root layers */ + int iRoot; /* Index of root in pWriter->aNodeWriter */ + NodeWriter *pRoot; /* NodeWriter for root node */ + int rc = *pRc; /* Error code */ + + /* Set iRoot to the index in pWriter->aNodeWriter[] of the output segment + ** root node. If the segment fits entirely on a single leaf node, iRoot + ** will be set to 0. If the root node is the parent of the leaves, iRoot + ** will be 1. And so on. */ + for(iRoot=FTS_MAX_APPENDABLE_HEIGHT-1; iRoot>=0; iRoot--){ + NodeWriter *pNode = &pWriter->aNodeWriter[iRoot]; + if( pNode->block.n>0 ) break; + assert( *pRc || pNode->block.nAlloc==0 ); + assert( *pRc || pNode->key.nAlloc==0 ); + sqlite3_free(pNode->block.a); + sqlite3_free(pNode->key.a); + } + + /* Empty output segment. This is a no-op. */ + if( iRoot<0 ) return; + + /* The entire output segment fits on a single node. Normally, this means + ** the node would be stored as a blob in the "root" column of the %_segdir + ** table. However, this is not permitted in this case. The problem is that + ** space has already been reserved in the %_segments table, and so the + ** start_block and end_block fields of the %_segdir table must be populated. + ** And, by design or by accident, released versions of FTS cannot handle + ** segments that fit entirely on the root node with start_block!=0. + ** + ** Instead, create a synthetic root node that contains nothing but a + ** pointer to the single content node. So that the segment consists of a + ** single leaf and a single interior (root) node. + ** + ** Todo: Better might be to defer allocating space in the %_segments + ** table until we are sure it is needed. + */ + if( iRoot==0 ){ + Blob *pBlock = &pWriter->aNodeWriter[1].block; + blobGrowBuffer(pBlock, 1 + FTS3_VARINT_MAX, &rc); + if( rc==SQLITE_OK ){ + pBlock->a[0] = 0x01; + pBlock->n = 1 + sqlite3Fts3PutVarint( + &pBlock->a[1], pWriter->aNodeWriter[0].iBlock + ); + } + iRoot = 1; + } + pRoot = &pWriter->aNodeWriter[iRoot]; + + /* Flush all currently outstanding nodes to disk. */ + for(i=0; i<iRoot; i++){ + NodeWriter *pNode = &pWriter->aNodeWriter[i]; + if( pNode->block.n>0 && rc==SQLITE_OK ){ + rc = fts3WriteSegment(p, pNode->iBlock, pNode->block.a, pNode->block.n); + } + sqlite3_free(pNode->block.a); + sqlite3_free(pNode->key.a); + } + + /* Write the %_segdir record. */ + if( rc==SQLITE_OK ){ + rc = fts3WriteSegdir(p, + pWriter->iAbsLevel+1, /* level */ + pWriter->iIdx, /* idx */ + pWriter->iStart, /* start_block */ + pWriter->aNodeWriter[0].iBlock, /* leaves_end_block */ + pWriter->iEnd, /* end_block */ + pRoot->block.a, pRoot->block.n /* root */ + ); + } + sqlite3_free(pRoot->block.a); + sqlite3_free(pRoot->key.a); + + *pRc = rc; +} + +/* +** Compare the term in buffer zLhs (size in bytes nLhs) with that in +** zRhs (size in bytes nRhs) using memcmp. If one term is a prefix of +** the other, it is considered to be smaller than the other. +** +** Return -ve if zLhs is smaller than zRhs, 0 if it is equal, or +ve +** if it is greater. +*/ +static int fts3TermCmp( + const char *zLhs, int nLhs, /* LHS of comparison */ + const char *zRhs, int nRhs /* RHS of comparison */ +){ + int nCmp = MIN(nLhs, nRhs); + int res; + + res = memcmp(zLhs, zRhs, nCmp); + if( res==0 ) res = nLhs - nRhs; + + return res; +} + + +/* +** Query to see if the entry in the %_segments table with blockid iEnd is +** NULL. If no error occurs and the entry is NULL, set *pbRes 1 before +** returning. Otherwise, set *pbRes to 0. +** +** Or, if an error occurs while querying the database, return an SQLite +** error code. The final value of *pbRes is undefined in this case. +** +** This is used to test if a segment is an "appendable" segment. If it +** is, then a NULL entry has been inserted into the %_segments table +** with blockid %_segdir.end_block. +*/ +static int fts3IsAppendable(Fts3Table *p, sqlite3_int64 iEnd, int *pbRes){ + int bRes = 0; /* Result to set *pbRes to */ + sqlite3_stmt *pCheck = 0; /* Statement to query database with */ + int rc; /* Return code */ + + rc = fts3SqlStmt(p, SQL_SEGMENT_IS_APPENDABLE, &pCheck, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int64(pCheck, 1, iEnd); + if( SQLITE_ROW==sqlite3_step(pCheck) ) bRes = 1; + rc = sqlite3_reset(pCheck); + } + + *pbRes = bRes; + return rc; +} + +/* +** This function is called when initializing an incremental-merge operation. +** It checks if the existing segment with index value iIdx at absolute level +** (iAbsLevel+1) can be appended to by the incremental merge. If it can, the +** merge-writer object *pWriter is initialized to write to it. +** +** An existing segment can be appended to by an incremental merge if: +** +** * It was initially created as an appendable segment (with all required +** space pre-allocated), and +** +** * The first key read from the input (arguments zKey and nKey) is +** greater than the largest key currently stored in the potential +** output segment. +*/ +static int fts3IncrmergeLoad( + Fts3Table *p, /* Fts3 table handle */ + sqlite3_int64 iAbsLevel, /* Absolute level of input segments */ + int iIdx, /* Index of candidate output segment */ + const char *zKey, /* First key to write */ + int nKey, /* Number of bytes in nKey */ + IncrmergeWriter *pWriter /* Populate this object */ +){ + int rc; /* Return code */ + sqlite3_stmt *pSelect = 0; /* SELECT to read %_segdir entry */ + + rc = fts3SqlStmt(p, SQL_SELECT_SEGDIR, &pSelect, 0); + if( rc==SQLITE_OK ){ + sqlite3_int64 iStart = 0; /* Value of %_segdir.start_block */ + sqlite3_int64 iLeafEnd = 0; /* Value of %_segdir.leaves_end_block */ + sqlite3_int64 iEnd = 0; /* Value of %_segdir.end_block */ + const char *aRoot = 0; /* Pointer to %_segdir.root buffer */ + int nRoot = 0; /* Size of aRoot[] in bytes */ + int rc2; /* Return code from sqlite3_reset() */ + int bAppendable = 0; /* Set to true if segment is appendable */ + + /* Read the %_segdir entry for index iIdx absolute level (iAbsLevel+1) */ + sqlite3_bind_int64(pSelect, 1, iAbsLevel+1); + sqlite3_bind_int(pSelect, 2, iIdx); + if( sqlite3_step(pSelect)==SQLITE_ROW ){ + iStart = sqlite3_column_int64(pSelect, 1); + iLeafEnd = sqlite3_column_int64(pSelect, 2); + iEnd = sqlite3_column_int64(pSelect, 3); + nRoot = sqlite3_column_bytes(pSelect, 4); + aRoot = sqlite3_column_blob(pSelect, 4); + }else{ + return sqlite3_reset(pSelect); + } + + /* Check for the zero-length marker in the %_segments table */ + rc = fts3IsAppendable(p, iEnd, &bAppendable); + + /* Check that zKey/nKey is larger than the largest key the candidate */ + if( rc==SQLITE_OK && bAppendable ){ + char *aLeaf = 0; + int nLeaf = 0; + + rc = sqlite3Fts3ReadBlock(p, iLeafEnd, &aLeaf, &nLeaf, 0); + if( rc==SQLITE_OK ){ + NodeReader reader; + for(rc = nodeReaderInit(&reader, aLeaf, nLeaf); + rc==SQLITE_OK && reader.aNode; + rc = nodeReaderNext(&reader) + ){ + assert( reader.aNode ); + } + if( fts3TermCmp(zKey, nKey, reader.term.a, reader.term.n)<=0 ){ + bAppendable = 0; + } + nodeReaderRelease(&reader); + } + sqlite3_free(aLeaf); + } + + if( rc==SQLITE_OK && bAppendable ){ + /* It is possible to append to this segment. Set up the IncrmergeWriter + ** object to do so. */ + int i; + int nHeight = (int)aRoot[0]; + NodeWriter *pNode; + + pWriter->nLeafEst = (int)((iEnd - iStart) + 1)/FTS_MAX_APPENDABLE_HEIGHT; + pWriter->iStart = iStart; + pWriter->iEnd = iEnd; + pWriter->iAbsLevel = iAbsLevel; + pWriter->iIdx = iIdx; + + for(i=nHeight+1; i<FTS_MAX_APPENDABLE_HEIGHT; i++){ + pWriter->aNodeWriter[i].iBlock = pWriter->iStart + i*pWriter->nLeafEst; + } + + pNode = &pWriter->aNodeWriter[nHeight]; + pNode->iBlock = pWriter->iStart + pWriter->nLeafEst*nHeight; + blobGrowBuffer(&pNode->block, MAX(nRoot, p->nNodeSize), &rc); + if( rc==SQLITE_OK ){ + memcpy(pNode->block.a, aRoot, nRoot); + pNode->block.n = nRoot; + } + + for(i=nHeight; i>=0 && rc==SQLITE_OK; i--){ + NodeReader reader; + pNode = &pWriter->aNodeWriter[i]; + + rc = nodeReaderInit(&reader, pNode->block.a, pNode->block.n); + while( reader.aNode && rc==SQLITE_OK ) rc = nodeReaderNext(&reader); + blobGrowBuffer(&pNode->key, reader.term.n, &rc); + if( rc==SQLITE_OK ){ + memcpy(pNode->key.a, reader.term.a, reader.term.n); + pNode->key.n = reader.term.n; + if( i>0 ){ + char *aBlock = 0; + int nBlock = 0; + pNode = &pWriter->aNodeWriter[i-1]; + pNode->iBlock = reader.iChild; + rc = sqlite3Fts3ReadBlock(p, reader.iChild, &aBlock, &nBlock, 0); + blobGrowBuffer(&pNode->block, MAX(nBlock, p->nNodeSize), &rc); + if( rc==SQLITE_OK ){ + memcpy(pNode->block.a, aBlock, nBlock); + pNode->block.n = nBlock; + } + sqlite3_free(aBlock); + } + } + nodeReaderRelease(&reader); + } + } + + rc2 = sqlite3_reset(pSelect); + if( rc==SQLITE_OK ) rc = rc2; + } + + return rc; +} + +/* +** Determine the largest segment index value that exists within absolute +** level iAbsLevel+1. If no error occurs, set *piIdx to this value plus +** one before returning SQLITE_OK. Or, if there are no segments at all +** within level iAbsLevel, set *piIdx to zero. +** +** If an error occurs, return an SQLite error code. The final value of +** *piIdx is undefined in this case. +*/ +static int fts3IncrmergeOutputIdx( + Fts3Table *p, /* FTS Table handle */ + sqlite3_int64 iAbsLevel, /* Absolute index of input segments */ + int *piIdx /* OUT: Next free index at iAbsLevel+1 */ +){ + int rc; + sqlite3_stmt *pOutputIdx = 0; /* SQL used to find output index */ + + rc = fts3SqlStmt(p, SQL_NEXT_SEGMENT_INDEX, &pOutputIdx, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int64(pOutputIdx, 1, iAbsLevel+1); + sqlite3_step(pOutputIdx); + *piIdx = sqlite3_column_int(pOutputIdx, 0); + rc = sqlite3_reset(pOutputIdx); + } + + return rc; +} + +/* +** Allocate an appendable output segment on absolute level iAbsLevel+1 +** with idx value iIdx. +** +** In the %_segdir table, a segment is defined by the values in three +** columns: +** +** start_block +** leaves_end_block +** end_block +** +** When an appendable segment is allocated, it is estimated that the +** maximum number of leaf blocks that may be required is the sum of the +** number of leaf blocks consumed by the input segments, plus the number +** of input segments, multiplied by two. This value is stored in stack +** variable nLeafEst. +** +** A total of 16*nLeafEst blocks are allocated when an appendable segment +** is created ((1 + end_block - start_block)==16*nLeafEst). The contiguous +** array of leaf nodes starts at the first block allocated. The array +** of interior nodes that are parents of the leaf nodes start at block +** (start_block + (1 + end_block - start_block) / 16). And so on. +** +** In the actual code below, the value "16" is replaced with the +** pre-processor macro FTS_MAX_APPENDABLE_HEIGHT. +*/ +static int fts3IncrmergeWriter( + Fts3Table *p, /* Fts3 table handle */ + sqlite3_int64 iAbsLevel, /* Absolute level of input segments */ + int iIdx, /* Index of new output segment */ + Fts3MultiSegReader *pCsr, /* Cursor that data will be read from */ + IncrmergeWriter *pWriter /* Populate this object */ +){ + int rc; /* Return Code */ + int i; /* Iterator variable */ + int nLeafEst = 0; /* Blocks allocated for leaf nodes */ + sqlite3_stmt *pLeafEst = 0; /* SQL used to determine nLeafEst */ + sqlite3_stmt *pFirstBlock = 0; /* SQL used to determine first block */ + + /* Calculate nLeafEst. */ + rc = fts3SqlStmt(p, SQL_MAX_LEAF_NODE_ESTIMATE, &pLeafEst, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int64(pLeafEst, 1, iAbsLevel); + sqlite3_bind_int64(pLeafEst, 2, pCsr->nSegment); + if( SQLITE_ROW==sqlite3_step(pLeafEst) ){ + nLeafEst = sqlite3_column_int(pLeafEst, 0); + } + rc = sqlite3_reset(pLeafEst); + } + if( rc!=SQLITE_OK ) return rc; + + /* Calculate the first block to use in the output segment */ + rc = fts3SqlStmt(p, SQL_NEXT_SEGMENTS_ID, &pFirstBlock, 0); + if( rc==SQLITE_OK ){ + if( SQLITE_ROW==sqlite3_step(pFirstBlock) ){ + pWriter->iStart = sqlite3_column_int64(pFirstBlock, 0); + pWriter->iEnd = pWriter->iStart - 1; + pWriter->iEnd += nLeafEst * FTS_MAX_APPENDABLE_HEIGHT; + } + rc = sqlite3_reset(pFirstBlock); + } + if( rc!=SQLITE_OK ) return rc; + + /* Insert the marker in the %_segments table to make sure nobody tries + ** to steal the space just allocated. This is also used to identify + ** appendable segments. */ + rc = fts3WriteSegment(p, pWriter->iEnd, 0, 0); + if( rc!=SQLITE_OK ) return rc; + + pWriter->iAbsLevel = iAbsLevel; + pWriter->nLeafEst = nLeafEst; + pWriter->iIdx = iIdx; + + /* Set up the array of NodeWriter objects */ + for(i=0; i<FTS_MAX_APPENDABLE_HEIGHT; i++){ + pWriter->aNodeWriter[i].iBlock = pWriter->iStart + i*pWriter->nLeafEst; + } + return SQLITE_OK; +} + +/* +** Remove an entry from the %_segdir table. This involves running the +** following two statements: +** +** DELETE FROM %_segdir WHERE level = :iAbsLevel AND idx = :iIdx +** UPDATE %_segdir SET idx = idx - 1 WHERE level = :iAbsLevel AND idx > :iIdx +** +** The DELETE statement removes the specific %_segdir level. The UPDATE +** statement ensures that the remaining segments have contiguously allocated +** idx values. +*/ +static int fts3RemoveSegdirEntry( + Fts3Table *p, /* FTS3 table handle */ + sqlite3_int64 iAbsLevel, /* Absolute level to delete from */ + int iIdx /* Index of %_segdir entry to delete */ +){ + int rc; /* Return code */ + sqlite3_stmt *pDelete = 0; /* DELETE statement */ + + rc = fts3SqlStmt(p, SQL_DELETE_SEGDIR_ENTRY, &pDelete, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int64(pDelete, 1, iAbsLevel); + sqlite3_bind_int(pDelete, 2, iIdx); + sqlite3_step(pDelete); + rc = sqlite3_reset(pDelete); + } + + return rc; +} + +/* +** One or more segments have just been removed from absolute level iAbsLevel. +** Update the 'idx' values of the remaining segments in the level so that +** the idx values are a contiguous sequence starting from 0. +*/ +static int fts3RepackSegdirLevel( + Fts3Table *p, /* FTS3 table handle */ + sqlite3_int64 iAbsLevel /* Absolute level to repack */ +){ + int rc; /* Return code */ + int *aIdx = 0; /* Array of remaining idx values */ + int nIdx = 0; /* Valid entries in aIdx[] */ + int nAlloc = 0; /* Allocated size of aIdx[] */ + int i; /* Iterator variable */ + sqlite3_stmt *pSelect = 0; /* Select statement to read idx values */ + sqlite3_stmt *pUpdate = 0; /* Update statement to modify idx values */ + + rc = fts3SqlStmt(p, SQL_SELECT_INDEXES, &pSelect, 0); + if( rc==SQLITE_OK ){ + int rc2; + sqlite3_bind_int64(pSelect, 1, iAbsLevel); + while( SQLITE_ROW==sqlite3_step(pSelect) ){ + if( nIdx>=nAlloc ){ + int *aNew; + nAlloc += 16; + aNew = sqlite3_realloc(aIdx, nAlloc*sizeof(int)); + if( !aNew ){ + rc = SQLITE_NOMEM; + break; + } + aIdx = aNew; + } + aIdx[nIdx++] = sqlite3_column_int(pSelect, 0); + } + rc2 = sqlite3_reset(pSelect); + if( rc==SQLITE_OK ) rc = rc2; + } + + if( rc==SQLITE_OK ){ + rc = fts3SqlStmt(p, SQL_SHIFT_SEGDIR_ENTRY, &pUpdate, 0); + } + if( rc==SQLITE_OK ){ + sqlite3_bind_int64(pUpdate, 2, iAbsLevel); + } + + assert( p->bIgnoreSavepoint==0 ); + p->bIgnoreSavepoint = 1; + for(i=0; rc==SQLITE_OK && i<nIdx; i++){ + if( aIdx[i]!=i ){ + sqlite3_bind_int(pUpdate, 3, aIdx[i]); + sqlite3_bind_int(pUpdate, 1, i); + sqlite3_step(pUpdate); + rc = sqlite3_reset(pUpdate); + } + } + p->bIgnoreSavepoint = 0; + + sqlite3_free(aIdx); + return rc; +} + +static void fts3StartNode(Blob *pNode, int iHeight, sqlite3_int64 iChild){ + pNode->a[0] = (char)iHeight; + if( iChild ){ + assert( pNode->nAlloc>=1+sqlite3Fts3VarintLen(iChild) ); + pNode->n = 1 + sqlite3Fts3PutVarint(&pNode->a[1], iChild); + }else{ + assert( pNode->nAlloc>=1 ); + pNode->n = 1; + } +} + +/* +** The first two arguments are a pointer to and the size of a segment b-tree +** node. The node may be a leaf or an internal node. +** +** This function creates a new node image in blob object *pNew by copying +** all terms that are greater than or equal to zTerm/nTerm (for leaf nodes) +** or greater than zTerm/nTerm (for internal nodes) from aNode/nNode. +*/ +static int fts3TruncateNode( + const char *aNode, /* Current node image */ + int nNode, /* Size of aNode in bytes */ + Blob *pNew, /* OUT: Write new node image here */ + const char *zTerm, /* Omit all terms smaller than this */ + int nTerm, /* Size of zTerm in bytes */ + sqlite3_int64 *piBlock /* OUT: Block number in next layer down */ +){ + NodeReader reader; /* Reader object */ + Blob prev = {0, 0, 0}; /* Previous term written to new node */ + int rc = SQLITE_OK; /* Return code */ + int bLeaf = aNode[0]=='\0'; /* True for a leaf node */ + + /* Allocate required output space */ + blobGrowBuffer(pNew, nNode, &rc); + if( rc!=SQLITE_OK ) return rc; + pNew->n = 0; + + /* Populate new node buffer */ + for(rc = nodeReaderInit(&reader, aNode, nNode); + rc==SQLITE_OK && reader.aNode; + rc = nodeReaderNext(&reader) + ){ + if( pNew->n==0 ){ + int res = fts3TermCmp(reader.term.a, reader.term.n, zTerm, nTerm); + if( res<0 || (bLeaf==0 && res==0) ) continue; + fts3StartNode(pNew, (int)aNode[0], reader.iChild); + *piBlock = reader.iChild; + } + rc = fts3AppendToNode( + pNew, &prev, reader.term.a, reader.term.n, + reader.aDoclist, reader.nDoclist + ); + if( rc!=SQLITE_OK ) break; + } + if( pNew->n==0 ){ + fts3StartNode(pNew, (int)aNode[0], reader.iChild); + *piBlock = reader.iChild; + } + assert( pNew->n<=pNew->nAlloc ); + + nodeReaderRelease(&reader); + sqlite3_free(prev.a); + return rc; +} + +/* +** Remove all terms smaller than zTerm/nTerm from segment iIdx in absolute +** level iAbsLevel. This may involve deleting entries from the %_segments +** table, and modifying existing entries in both the %_segments and %_segdir +** tables. +** +** SQLITE_OK is returned if the segment is updated successfully. Or an +** SQLite error code otherwise. +*/ +static int fts3TruncateSegment( + Fts3Table *p, /* FTS3 table handle */ + sqlite3_int64 iAbsLevel, /* Absolute level of segment to modify */ + int iIdx, /* Index within level of segment to modify */ + const char *zTerm, /* Remove terms smaller than this */ + int nTerm /* Number of bytes in buffer zTerm */ +){ + int rc = SQLITE_OK; /* Return code */ + Blob root = {0,0,0}; /* New root page image */ + Blob block = {0,0,0}; /* Buffer used for any other block */ + sqlite3_int64 iBlock = 0; /* Block id */ + sqlite3_int64 iNewStart = 0; /* New value for iStartBlock */ + sqlite3_int64 iOldStart = 0; /* Old value for iStartBlock */ + sqlite3_stmt *pFetch = 0; /* Statement used to fetch segdir */ + + rc = fts3SqlStmt(p, SQL_SELECT_SEGDIR, &pFetch, 0); + if( rc==SQLITE_OK ){ + int rc2; /* sqlite3_reset() return code */ + sqlite3_bind_int64(pFetch, 1, iAbsLevel); + sqlite3_bind_int(pFetch, 2, iIdx); + if( SQLITE_ROW==sqlite3_step(pFetch) ){ + const char *aRoot = sqlite3_column_blob(pFetch, 4); + int nRoot = sqlite3_column_bytes(pFetch, 4); + iOldStart = sqlite3_column_int64(pFetch, 1); + rc = fts3TruncateNode(aRoot, nRoot, &root, zTerm, nTerm, &iBlock); + } + rc2 = sqlite3_reset(pFetch); + if( rc==SQLITE_OK ) rc = rc2; + } + + while( rc==SQLITE_OK && iBlock ){ + char *aBlock = 0; + int nBlock = 0; + iNewStart = iBlock; + + rc = sqlite3Fts3ReadBlock(p, iBlock, &aBlock, &nBlock, 0); + if( rc==SQLITE_OK ){ + rc = fts3TruncateNode(aBlock, nBlock, &block, zTerm, nTerm, &iBlock); + } + if( rc==SQLITE_OK ){ + rc = fts3WriteSegment(p, iNewStart, block.a, block.n); + } + sqlite3_free(aBlock); + } + + /* Variable iNewStart now contains the first valid leaf node. */ + if( rc==SQLITE_OK && iNewStart ){ + sqlite3_stmt *pDel = 0; + rc = fts3SqlStmt(p, SQL_DELETE_SEGMENTS_RANGE, &pDel, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int64(pDel, 1, iOldStart); + sqlite3_bind_int64(pDel, 2, iNewStart-1); + sqlite3_step(pDel); + rc = sqlite3_reset(pDel); + } + } + + if( rc==SQLITE_OK ){ + sqlite3_stmt *pChomp = 0; + rc = fts3SqlStmt(p, SQL_CHOMP_SEGDIR, &pChomp, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int64(pChomp, 1, iNewStart); + sqlite3_bind_blob(pChomp, 2, root.a, root.n, SQLITE_STATIC); + sqlite3_bind_int64(pChomp, 3, iAbsLevel); + sqlite3_bind_int(pChomp, 4, iIdx); + sqlite3_step(pChomp); + rc = sqlite3_reset(pChomp); + } + } + + sqlite3_free(root.a); + sqlite3_free(block.a); + return rc; +} + +/* +** This function is called after an incrmental-merge operation has run to +** merge (or partially merge) two or more segments from absolute level +** iAbsLevel. +** +** Each input segment is either removed from the db completely (if all of +** its data was copied to the output segment by the incrmerge operation) +** or modified in place so that it no longer contains those entries that +** have been duplicated in the output segment. +*/ +static int fts3IncrmergeChomp( + Fts3Table *p, /* FTS table handle */ + sqlite3_int64 iAbsLevel, /* Absolute level containing segments */ + Fts3MultiSegReader *pCsr, /* Chomp all segments opened by this cursor */ + int *pnRem /* Number of segments not deleted */ +){ + int i; + int nRem = 0; + int rc = SQLITE_OK; + + for(i=pCsr->nSegment-1; i>=0 && rc==SQLITE_OK; i--){ + Fts3SegReader *pSeg = 0; + int j; + + /* Find the Fts3SegReader object with Fts3SegReader.iIdx==i. It is hiding + ** somewhere in the pCsr->apSegment[] array. */ + for(j=0; ALWAYS(j<pCsr->nSegment); j++){ + pSeg = pCsr->apSegment[j]; + if( pSeg->iIdx==i ) break; + } + assert( j<pCsr->nSegment && pSeg->iIdx==i ); + + if( pSeg->aNode==0 ){ + /* Seg-reader is at EOF. Remove the entire input segment. */ + rc = fts3DeleteSegment(p, pSeg); + if( rc==SQLITE_OK ){ + rc = fts3RemoveSegdirEntry(p, iAbsLevel, pSeg->iIdx); + } + *pnRem = 0; + }else{ + /* The incremental merge did not copy all the data from this + ** segment to the upper level. The segment is modified in place + ** so that it contains no keys smaller than zTerm/nTerm. */ + const char *zTerm = pSeg->zTerm; + int nTerm = pSeg->nTerm; + rc = fts3TruncateSegment(p, iAbsLevel, pSeg->iIdx, zTerm, nTerm); + nRem++; + } + } + + if( rc==SQLITE_OK && nRem!=pCsr->nSegment ){ + rc = fts3RepackSegdirLevel(p, iAbsLevel); + } + + *pnRem = nRem; + return rc; +} + +/* +** Store an incr-merge hint in the database. +*/ +static int fts3IncrmergeHintStore(Fts3Table *p, Blob *pHint){ + sqlite3_stmt *pReplace = 0; + int rc; /* Return code */ + + rc = fts3SqlStmt(p, SQL_REPLACE_STAT, &pReplace, 0); + if( rc==SQLITE_OK ){ + sqlite3_bind_int(pReplace, 1, FTS_STAT_INCRMERGEHINT); + sqlite3_bind_blob(pReplace, 2, pHint->a, pHint->n, SQLITE_STATIC); + sqlite3_step(pReplace); + rc = sqlite3_reset(pReplace); + } + + return rc; +} + +/* +** Load an incr-merge hint from the database. The incr-merge hint, if one +** exists, is stored in the rowid==1 row of the %_stat table. +** +** If successful, populate blob *pHint with the value read from the %_stat +** table and return SQLITE_OK. Otherwise, if an error occurs, return an +** SQLite error code. +*/ +static int fts3IncrmergeHintLoad(Fts3Table *p, Blob *pHint){ + sqlite3_stmt *pSelect = 0; + int rc; + + pHint->n = 0; + rc = fts3SqlStmt(p, SQL_SELECT_STAT, &pSelect, 0); + if( rc==SQLITE_OK ){ + int rc2; + sqlite3_bind_int(pSelect, 1, FTS_STAT_INCRMERGEHINT); + if( SQLITE_ROW==sqlite3_step(pSelect) ){ + const char *aHint = sqlite3_column_blob(pSelect, 0); + int nHint = sqlite3_column_bytes(pSelect, 0); + if( aHint ){ + blobGrowBuffer(pHint, nHint, &rc); + if( rc==SQLITE_OK ){ + memcpy(pHint->a, aHint, nHint); + pHint->n = nHint; + } + } + } + rc2 = sqlite3_reset(pSelect); + if( rc==SQLITE_OK ) rc = rc2; + } + + return rc; +} + +/* +** If *pRc is not SQLITE_OK when this function is called, it is a no-op. +** Otherwise, append an entry to the hint stored in blob *pHint. Each entry +** consists of two varints, the absolute level number of the input segments +** and the number of input segments. +** +** If successful, leave *pRc set to SQLITE_OK and return. If an error occurs, +** set *pRc to an SQLite error code before returning. +*/ +static void fts3IncrmergeHintPush( + Blob *pHint, /* Hint blob to append to */ + i64 iAbsLevel, /* First varint to store in hint */ + int nInput, /* Second varint to store in hint */ + int *pRc /* IN/OUT: Error code */ +){ + blobGrowBuffer(pHint, pHint->n + 2*FTS3_VARINT_MAX, pRc); + if( *pRc==SQLITE_OK ){ + pHint->n += sqlite3Fts3PutVarint(&pHint->a[pHint->n], iAbsLevel); + pHint->n += sqlite3Fts3PutVarint(&pHint->a[pHint->n], (i64)nInput); + } +} + +/* +** Read the last entry (most recently pushed) from the hint blob *pHint +** and then remove the entry. Write the two values read to *piAbsLevel and +** *pnInput before returning. +** +** If no error occurs, return SQLITE_OK. If the hint blob in *pHint does +** not contain at least two valid varints, return SQLITE_CORRUPT_VTAB. +*/ +static int fts3IncrmergeHintPop(Blob *pHint, i64 *piAbsLevel, int *pnInput){ + const int nHint = pHint->n; + int i; + + i = pHint->n-2; + while( i>0 && (pHint->a[i-1] & 0x80) ) i--; + while( i>0 && (pHint->a[i-1] & 0x80) ) i--; + + pHint->n = i; + i += sqlite3Fts3GetVarint(&pHint->a[i], piAbsLevel); + i += sqlite3Fts3GetVarint32(&pHint->a[i], pnInput); + if( i!=nHint ) return SQLITE_CORRUPT_VTAB; + + return SQLITE_OK; +} + + +/* +** Attempt an incremental merge that writes nMerge leaf blocks. +** +** Incremental merges happen nMin segments at a time. The two +** segments to be merged are the nMin oldest segments (the ones with +** the smallest indexes) in the highest level that contains at least +** nMin segments. Multiple merges might occur in an attempt to write the +** quota of nMerge leaf blocks. +*/ +int sqlite3Fts3Incrmerge(Fts3Table *p, int nMerge, int nMin){ + int rc; /* Return code */ + int nRem = nMerge; /* Number of leaf pages yet to be written */ + Fts3MultiSegReader *pCsr; /* Cursor used to read input data */ + Fts3SegFilter *pFilter; /* Filter used with cursor pCsr */ + IncrmergeWriter *pWriter; /* Writer object */ + int nSeg = 0; /* Number of input segments */ + sqlite3_int64 iAbsLevel = 0; /* Absolute level number to work on */ + Blob hint = {0, 0, 0}; /* Hint read from %_stat table */ + int bDirtyHint = 0; /* True if blob 'hint' has been modified */ + + /* Allocate space for the cursor, filter and writer objects */ + const int nAlloc = sizeof(*pCsr) + sizeof(*pFilter) + sizeof(*pWriter); + pWriter = (IncrmergeWriter *)sqlite3_malloc(nAlloc); + if( !pWriter ) return SQLITE_NOMEM; + pFilter = (Fts3SegFilter *)&pWriter[1]; + pCsr = (Fts3MultiSegReader *)&pFilter[1]; + + rc = fts3IncrmergeHintLoad(p, &hint); + while( rc==SQLITE_OK && nRem>0 ){ + const i64 nMod = FTS3_SEGDIR_MAXLEVEL * p->nIndex; + sqlite3_stmt *pFindLevel = 0; /* SQL used to determine iAbsLevel */ + int bUseHint = 0; /* True if attempting to append */ + + /* Search the %_segdir table for the absolute level with the smallest + ** relative level number that contains at least nMin segments, if any. + ** If one is found, set iAbsLevel to the absolute level number and + ** nSeg to nMin. If no level with at least nMin segments can be found, + ** set nSeg to -1. + */ + rc = fts3SqlStmt(p, SQL_FIND_MERGE_LEVEL, &pFindLevel, 0); + sqlite3_bind_int(pFindLevel, 1, nMin); + if( sqlite3_step(pFindLevel)==SQLITE_ROW ){ + iAbsLevel = sqlite3_column_int64(pFindLevel, 0); + nSeg = nMin; + }else{ + nSeg = -1; + } + rc = sqlite3_reset(pFindLevel); + + /* If the hint read from the %_stat table is not empty, check if the + ** last entry in it specifies a relative level smaller than or equal + ** to the level identified by the block above (if any). If so, this + ** iteration of the loop will work on merging at the hinted level. + */ + if( rc==SQLITE_OK && hint.n ){ + int nHint = hint.n; + sqlite3_int64 iHintAbsLevel = 0; /* Hint level */ + int nHintSeg = 0; /* Hint number of segments */ + + rc = fts3IncrmergeHintPop(&hint, &iHintAbsLevel, &nHintSeg); + if( nSeg<0 || (iAbsLevel % nMod) >= (iHintAbsLevel % nMod) ){ + iAbsLevel = iHintAbsLevel; + nSeg = nHintSeg; + bUseHint = 1; + bDirtyHint = 1; + }else{ + /* This undoes the effect of the HintPop() above - so that no entry + ** is removed from the hint blob. */ + hint.n = nHint; + } + } + + /* If nSeg is less that zero, then there is no level with at least + ** nMin segments and no hint in the %_stat table. No work to do. + ** Exit early in this case. */ + if( nSeg<0 ) break; + + /* Open a cursor to iterate through the contents of the oldest nSeg + ** indexes of absolute level iAbsLevel. If this cursor is opened using + ** the 'hint' parameters, it is possible that there are less than nSeg + ** segments available in level iAbsLevel. In this case, no work is + ** done on iAbsLevel - fall through to the next iteration of the loop + ** to start work on some other level. */ + memset(pWriter, 0, nAlloc); + pFilter->flags = FTS3_SEGMENT_REQUIRE_POS; + if( rc==SQLITE_OK ){ + rc = fts3IncrmergeCsr(p, iAbsLevel, nSeg, pCsr); + } + if( SQLITE_OK==rc && pCsr->nSegment==nSeg + && SQLITE_OK==(rc = sqlite3Fts3SegReaderStart(p, pCsr, pFilter)) + && SQLITE_ROW==(rc = sqlite3Fts3SegReaderStep(p, pCsr)) + ){ + int iIdx = 0; /* Largest idx in level (iAbsLevel+1) */ + rc = fts3IncrmergeOutputIdx(p, iAbsLevel, &iIdx); + if( rc==SQLITE_OK ){ + if( bUseHint && iIdx>0 ){ + const char *zKey = pCsr->zTerm; + int nKey = pCsr->nTerm; + rc = fts3IncrmergeLoad(p, iAbsLevel, iIdx-1, zKey, nKey, pWriter); + }else{ + rc = fts3IncrmergeWriter(p, iAbsLevel, iIdx, pCsr, pWriter); + } + } + + if( rc==SQLITE_OK && pWriter->nLeafEst ){ + fts3LogMerge(nSeg, iAbsLevel); + do { + rc = fts3IncrmergeAppend(p, pWriter, pCsr); + if( rc==SQLITE_OK ) rc = sqlite3Fts3SegReaderStep(p, pCsr); + if( pWriter->nWork>=nRem && rc==SQLITE_ROW ) rc = SQLITE_OK; + }while( rc==SQLITE_ROW ); + + /* Update or delete the input segments */ + if( rc==SQLITE_OK ){ + nRem -= (1 + pWriter->nWork); + rc = fts3IncrmergeChomp(p, iAbsLevel, pCsr, &nSeg); + if( nSeg!=0 ){ + bDirtyHint = 1; + fts3IncrmergeHintPush(&hint, iAbsLevel, nSeg, &rc); + } + } + } + + fts3IncrmergeRelease(p, pWriter, &rc); + } + + sqlite3Fts3SegReaderFinish(pCsr); + } + + /* Write the hint values into the %_stat table for the next incr-merger */ + if( bDirtyHint && rc==SQLITE_OK ){ + rc = fts3IncrmergeHintStore(p, &hint); + } + + sqlite3_free(pWriter); + sqlite3_free(hint.a); + return rc; +} + +/* +** Convert the text beginning at *pz into an integer and return +** its value. Advance *pz to point to the first character past +** the integer. +*/ +static int fts3Getint(const char **pz){ + const char *z = *pz; + int i = 0; + while( (*z)>='0' && (*z)<='9' ) i = 10*i + *(z++) - '0'; + *pz = z; + return i; +} + +/* +** Process statements of the form: +** +** INSERT INTO table(table) VALUES('merge=A,B'); +** +** A and B are integers that decode to be the number of leaf pages +** written for the merge, and the minimum number of segments on a level +** before it will be selected for a merge, respectively. +*/ +static int fts3DoIncrmerge( + Fts3Table *p, /* FTS3 table handle */ + const char *zParam /* Nul-terminated string containing "A,B" */ +){ + int rc; + int nMin = (FTS3_MERGE_COUNT / 2); + int nMerge = 0; + const char *z = zParam; + + /* Read the first integer value */ + nMerge = fts3Getint(&z); + + /* If the first integer value is followed by a ',', read the second + ** integer value. */ + if( z[0]==',' && z[1]!='\0' ){ + z++; + nMin = fts3Getint(&z); + } + + if( z[0]!='\0' || nMin<2 ){ + rc = SQLITE_ERROR; + }else{ + rc = SQLITE_OK; + if( !p->bHasStat ){ + assert( p->bFts4==0 ); + sqlite3Fts3CreateStatTable(&rc, p); + } + if( rc==SQLITE_OK ){ + rc = sqlite3Fts3Incrmerge(p, nMerge, nMin); + } + sqlite3Fts3SegmentsClose(p); + } + return rc; +} + +/* +** Process statements of the form: +** +** INSERT INTO table(table) VALUES('automerge=X'); +** +** where X is an integer. X==0 means to turn automerge off. X!=0 means +** turn it on. The setting is persistent. +*/ +static int fts3DoAutoincrmerge( + Fts3Table *p, /* FTS3 table handle */ + const char *zParam /* Nul-terminated string containing boolean */ +){ + int rc = SQLITE_OK; + sqlite3_stmt *pStmt = 0; + p->bAutoincrmerge = fts3Getint(&zParam)!=0; + if( !p->bHasStat ){ + assert( p->bFts4==0 ); + sqlite3Fts3CreateStatTable(&rc, p); + if( rc ) return rc; + } + rc = fts3SqlStmt(p, SQL_REPLACE_STAT, &pStmt, 0); + if( rc ) return rc;; + sqlite3_bind_int(pStmt, 1, FTS_STAT_AUTOINCRMERGE); + sqlite3_bind_int(pStmt, 2, p->bAutoincrmerge); + sqlite3_step(pStmt); + rc = sqlite3_reset(pStmt); + return rc; +} + +/* +** Return a 64-bit checksum for the FTS index entry specified by the +** arguments to this function. +*/ +static u64 fts3ChecksumEntry( + const char *zTerm, /* Pointer to buffer containing term */ + int nTerm, /* Size of zTerm in bytes */ + int iLangid, /* Language id for current row */ + int iIndex, /* Index (0..Fts3Table.nIndex-1) */ + i64 iDocid, /* Docid for current row. */ + int iCol, /* Column number */ + int iPos /* Position */ +){ + int i; + u64 ret = (u64)iDocid; + + ret += (ret<<3) + iLangid; + ret += (ret<<3) + iIndex; + ret += (ret<<3) + iCol; + ret += (ret<<3) + iPos; + for(i=0; i<nTerm; i++) ret += (ret<<3) + zTerm[i]; + + return ret; +} + +/* +** Return a checksum of all entries in the FTS index that correspond to +** language id iLangid. The checksum is calculated by XORing the checksums +** of each individual entry (see fts3ChecksumEntry()) together. +** +** If successful, the checksum value is returned and *pRc set to SQLITE_OK. +** Otherwise, if an error occurs, *pRc is set to an SQLite error code. The +** return value is undefined in this case. +*/ +static u64 fts3ChecksumIndex( + Fts3Table *p, /* FTS3 table handle */ + int iLangid, /* Language id to return cksum for */ + int iIndex, /* Index to cksum (0..p->nIndex-1) */ + int *pRc /* OUT: Return code */ +){ + Fts3SegFilter filter; + Fts3MultiSegReader csr; + int rc; + u64 cksum = 0; + + assert( *pRc==SQLITE_OK ); + + memset(&filter, 0, sizeof(filter)); + memset(&csr, 0, sizeof(csr)); + filter.flags = FTS3_SEGMENT_REQUIRE_POS|FTS3_SEGMENT_IGNORE_EMPTY; + filter.flags |= FTS3_SEGMENT_SCAN; + + rc = sqlite3Fts3SegReaderCursor( + p, iLangid, iIndex, FTS3_SEGCURSOR_ALL, 0, 0, 0, 1,&csr + ); + if( rc==SQLITE_OK ){ + rc = sqlite3Fts3SegReaderStart(p, &csr, &filter); + } + + if( rc==SQLITE_OK ){ + while( SQLITE_ROW==(rc = sqlite3Fts3SegReaderStep(p, &csr)) ){ + char *pCsr = csr.aDoclist; + char *pEnd = &pCsr[csr.nDoclist]; + + i64 iDocid = 0; + i64 iCol = 0; + i64 iPos = 0; + + pCsr += sqlite3Fts3GetVarint(pCsr, &iDocid); + while( pCsr<pEnd ){ + i64 iVal = 0; + pCsr += sqlite3Fts3GetVarint(pCsr, &iVal); + if( pCsr<pEnd ){ + if( iVal==0 || iVal==1 ){ + iCol = 0; + iPos = 0; + if( iVal ){ + pCsr += sqlite3Fts3GetVarint(pCsr, &iCol); + }else{ + pCsr += sqlite3Fts3GetVarint(pCsr, &iVal); + iDocid += iVal; + } + }else{ + iPos += (iVal - 2); + cksum = cksum ^ fts3ChecksumEntry( + csr.zTerm, csr.nTerm, iLangid, iIndex, iDocid, + (int)iCol, (int)iPos + ); + } + } + } + } + } + sqlite3Fts3SegReaderFinish(&csr); + + *pRc = rc; + return cksum; +} + +/* +** Check if the contents of the FTS index match the current contents of the +** content table. If no error occurs and the contents do match, set *pbOk +** to true and return SQLITE_OK. Or if the contents do not match, set *pbOk +** to false before returning. +** +** If an error occurs (e.g. an OOM or IO error), return an SQLite error +** code. The final value of *pbOk is undefined in this case. +*/ +static int fts3IntegrityCheck(Fts3Table *p, int *pbOk){ + int rc = SQLITE_OK; /* Return code */ + u64 cksum1 = 0; /* Checksum based on FTS index contents */ + u64 cksum2 = 0; /* Checksum based on %_content contents */ + sqlite3_stmt *pAllLangid = 0; /* Statement to return all language-ids */ + + /* This block calculates the checksum according to the FTS index. */ + rc = fts3SqlStmt(p, SQL_SELECT_ALL_LANGID, &pAllLangid, 0); + if( rc==SQLITE_OK ){ + int rc2; + sqlite3_bind_int(pAllLangid, 1, p->nIndex); + while( rc==SQLITE_OK && sqlite3_step(pAllLangid)==SQLITE_ROW ){ + int iLangid = sqlite3_column_int(pAllLangid, 0); + int i; + for(i=0; i<p->nIndex; i++){ + cksum1 = cksum1 ^ fts3ChecksumIndex(p, iLangid, i, &rc); + } + } + rc2 = sqlite3_reset(pAllLangid); + if( rc==SQLITE_OK ) rc = rc2; + } + + /* This block calculates the checksum according to the %_content table */ + rc = fts3SqlStmt(p, SQL_SELECT_ALL_LANGID, &pAllLangid, 0); + if( rc==SQLITE_OK ){ + sqlite3_tokenizer_module const *pModule = p->pTokenizer->pModule; + sqlite3_stmt *pStmt = 0; + char *zSql; + + zSql = sqlite3_mprintf("SELECT %s" , p->zReadExprlist); + if( !zSql ){ + rc = SQLITE_NOMEM; + }else{ + rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0); + sqlite3_free(zSql); + } + + while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pStmt) ){ + i64 iDocid = sqlite3_column_int64(pStmt, 0); + int iLang = langidFromSelect(p, pStmt); + int iCol; + + for(iCol=0; rc==SQLITE_OK && iCol<p->nColumn; iCol++){ + const char *zText = (const char *)sqlite3_column_text(pStmt, iCol+1); + int nText = sqlite3_column_bytes(pStmt, iCol+1); + sqlite3_tokenizer_cursor *pT = 0; + + rc = sqlite3Fts3OpenTokenizer(p->pTokenizer, iLang, zText, nText, &pT); + while( rc==SQLITE_OK ){ + char const *zToken; /* Buffer containing token */ + int nToken = 0; /* Number of bytes in token */ + int iDum1 = 0, iDum2 = 0; /* Dummy variables */ + int iPos = 0; /* Position of token in zText */ + + rc = pModule->xNext(pT, &zToken, &nToken, &iDum1, &iDum2, &iPos); + if( rc==SQLITE_OK ){ + int i; + cksum2 = cksum2 ^ fts3ChecksumEntry( + zToken, nToken, iLang, 0, iDocid, iCol, iPos + ); + for(i=1; i<p->nIndex; i++){ + if( p->aIndex[i].nPrefix<=nToken ){ + cksum2 = cksum2 ^ fts3ChecksumEntry( + zToken, p->aIndex[i].nPrefix, iLang, i, iDocid, iCol, iPos + ); + } + } + } + } + if( pT ) pModule->xClose(pT); + if( rc==SQLITE_DONE ) rc = SQLITE_OK; + } + } + + sqlite3_finalize(pStmt); + } + + *pbOk = (cksum1==cksum2); + return rc; +} + +/* +** Run the integrity-check. If no error occurs and the current contents of +** the FTS index are correct, return SQLITE_OK. Or, if the contents of the +** FTS index are incorrect, return SQLITE_CORRUPT_VTAB. +** +** Or, if an error (e.g. an OOM or IO error) occurs, return an SQLite +** error code. +** +** The integrity-check works as follows. For each token and indexed token +** prefix in the document set, a 64-bit checksum is calculated (by code +** in fts3ChecksumEntry()) based on the following: +** +** + The index number (0 for the main index, 1 for the first prefix +** index etc.), +** + The token (or token prefix) text itself, +** + The language-id of the row it appears in, +** + The docid of the row it appears in, +** + The column it appears in, and +** + The tokens position within that column. +** +** The checksums for all entries in the index are XORed together to create +** a single checksum for the entire index. +** +** The integrity-check code calculates the same checksum in two ways: +** +** 1. By scanning the contents of the FTS index, and +** 2. By scanning and tokenizing the content table. +** +** If the two checksums are identical, the integrity-check is deemed to have +** passed. +*/ +static int fts3DoIntegrityCheck( + Fts3Table *p /* FTS3 table handle */ +){ + int rc; + int bOk = 0; + rc = fts3IntegrityCheck(p, &bOk); + if( rc==SQLITE_OK && bOk==0 ) rc = SQLITE_CORRUPT_VTAB; + return rc; +} + +/* ** Handle a 'special' INSERT of the form: ** ** "INSERT INTO tbl(tbl) VALUES(<expr>)" @@ -2928,6 +5026,14 @@ static int fts3SpecialInsert(Fts3Table *p, sqlite3_value *pVal){ return SQLITE_NOMEM; }else if( nVal==8 && 0==sqlite3_strnicmp(zVal, "optimize", 8) ){ rc = fts3DoOptimize(p, 0); + }else if( nVal==7 && 0==sqlite3_strnicmp(zVal, "rebuild", 7) ){ + rc = fts3DoRebuild(p); + }else if( nVal==15 && 0==sqlite3_strnicmp(zVal, "integrity-check", 15) ){ + rc = fts3DoIntegrityCheck(p); + }else if( nVal>6 && 0==sqlite3_strnicmp(zVal, "merge=", 6) ){ + rc = fts3DoIncrmerge(p, &zVal[6]); + }else if( nVal>10 && 0==sqlite3_strnicmp(zVal, "automerge=", 10) ){ + rc = fts3DoAutoincrmerge(p, &zVal[10]); #ifdef SQLITE_TEST }else if( nVal>9 && 0==sqlite3_strnicmp(zVal, "nodesize=", 9) ){ p->nNodeSize = atoi(&zVal[9]); @@ -2943,6 +5049,7 @@ static int fts3SpecialInsert(Fts3Table *p, sqlite3_value *pVal){ return rc; } +#ifndef SQLITE_DISABLE_FTS4_DEFERRED /* ** Delete all cached deferred doclists. Deferred doclists are cached ** (allocated) by the sqlite3Fts3CacheDeferredDoclists() function. @@ -2996,18 +5103,18 @@ int sqlite3Fts3CacheDeferredDoclists(Fts3Cursor *pCsr){ const char *zText = (const char *)sqlite3_column_text(pCsr->pStmt, i+1); sqlite3_tokenizer_cursor *pTC = 0; - rc = pModule->xOpen(pT, zText, -1, &pTC); + rc = sqlite3Fts3OpenTokenizer(pT, pCsr->iLangid, zText, -1, &pTC); while( rc==SQLITE_OK ){ char const *zToken; /* Buffer containing token */ - int nToken; /* Number of bytes in token */ - int iDum1, iDum2; /* Dummy variables */ - int iPos; /* Position of token in zText */ + int nToken = 0; /* Number of bytes in token */ + int iDum1 = 0, iDum2 = 0; /* Dummy variables */ + int iPos = 0; /* Position of token in zText */ - pTC->pTokenizer = pT; rc = pModule->xNext(pTC, &zToken, &nToken, &iDum1, &iDum2, &iPos); for(pDef=pCsr->pDeferred; pDef && rc==SQLITE_OK; pDef=pDef->pNext){ Fts3PhraseToken *pPT = pDef->pToken; if( (pDef->iCol>=p->nColumn || pDef->iCol==i) + && (pPT->bFirst==0 || iPos==0) && (pPT->n==nToken || (pPT->isPrefix && pPT->n<nToken)) && (0==memcmp(zToken, pPT->z, pPT->n)) ){ @@ -3080,6 +5187,7 @@ int sqlite3Fts3DeferToken( return SQLITE_OK; } +#endif /* ** SQLite value pRowid contains the rowid of a row that may or may not be @@ -3089,26 +5197,32 @@ int sqlite3Fts3DeferToken( static int fts3DeleteByRowid( Fts3Table *p, sqlite3_value *pRowid, - int *pnDoc, + int *pnChng, /* IN/OUT: Decrement if row is deleted */ u32 *aSzDel ){ - int isEmpty = 0; - int rc = fts3IsEmpty(p, pRowid, &isEmpty); - if( rc==SQLITE_OK ){ - if( isEmpty ){ - /* Deleting this row means the whole table is empty. In this case - ** delete the contents of all three tables and throw away any - ** data in the pendingTerms hash table. */ - rc = fts3DeleteAll(p); - *pnDoc = *pnDoc - 1; - }else{ - sqlite3_int64 iRemove = sqlite3_value_int64(pRowid); - rc = fts3PendingTermsDocid(p, iRemove); - fts3DeleteTerms(&rc, p, pRowid, aSzDel); - fts3SqlExec(&rc, p, SQL_DELETE_CONTENT, &pRowid); - if( sqlite3_changes(p->db) ) *pnDoc = *pnDoc - 1; - if( p->bHasDocsize ){ - fts3SqlExec(&rc, p, SQL_DELETE_DOCSIZE, &pRowid); + int rc = SQLITE_OK; /* Return code */ + int bFound = 0; /* True if *pRowid really is in the table */ + + fts3DeleteTerms(&rc, p, pRowid, aSzDel, &bFound); + if( bFound && rc==SQLITE_OK ){ + int isEmpty = 0; /* Deleting *pRowid leaves the table empty */ + rc = fts3IsEmpty(p, pRowid, &isEmpty); + if( rc==SQLITE_OK ){ + if( isEmpty ){ + /* Deleting this row means the whole table is empty. In this case + ** delete the contents of all three tables and throw away any + ** data in the pendingTerms hash table. */ + rc = fts3DeleteAll(p, 1); + *pnChng = 0; + memset(aSzDel, 0, sizeof(u32) * (p->nColumn+1) * 2); + }else{ + *pnChng = *pnChng - 1; + if( p->zContentTbl==0 ){ + fts3SqlExec(&rc, p, SQL_DELETE_CONTENT, &pRowid); + } + if( p->bHasDocsize ){ + fts3SqlExec(&rc, p, SQL_DELETE_DOCSIZE, &pRowid); + } } } } @@ -3118,7 +5232,16 @@ static int fts3DeleteByRowid( /* ** This function does the work for the xUpdate method of FTS3 virtual -** tables. +** tables. The schema of the virtual table being: +** +** CREATE TABLE <table name>( +** <user columns>, +** <table name> HIDDEN, +** docid HIDDEN, +** <langid> HIDDEN +** ); +** +** */ int sqlite3Fts3UpdateMethod( sqlite3_vtab *pVtab, /* FTS3 vtab object */ @@ -3129,13 +5252,16 @@ int sqlite3Fts3UpdateMethod( Fts3Table *p = (Fts3Table *)pVtab; int rc = SQLITE_OK; /* Return Code */ int isRemove = 0; /* True for an UPDATE or DELETE */ - sqlite3_int64 iRemove = 0; /* Rowid removed by UPDATE or DELETE */ u32 *aSzIns = 0; /* Sizes of inserted documents */ - u32 *aSzDel; /* Sizes of deleted documents */ + u32 *aSzDel = 0; /* Sizes of deleted documents */ int nChng = 0; /* Net change in number of documents */ int bInsertDone = 0; assert( p->pSegments==0 ); + assert( + nArg==1 /* DELETE operations */ + || nArg==(2 + p->nColumn + 3) /* INSERT or UPDATE operations */ + ); /* Check for a "special" INSERT operation. One of the form: ** @@ -3149,14 +5275,19 @@ int sqlite3Fts3UpdateMethod( goto update_out; } + if( nArg>1 && sqlite3_value_int(apVal[2 + p->nColumn + 2])<0 ){ + rc = SQLITE_CONSTRAINT; + goto update_out; + } + /* Allocate space to hold the change in document sizes */ - aSzIns = sqlite3_malloc( sizeof(aSzIns[0])*(p->nColumn+1)*2 ); - if( aSzIns==0 ){ + aSzDel = sqlite3_malloc( sizeof(aSzDel[0])*(p->nColumn+1)*2 ); + if( aSzDel==0 ){ rc = SQLITE_NOMEM; goto update_out; } - aSzDel = &aSzIns[p->nColumn+1]; - memset(aSzIns, 0, sizeof(aSzIns[0])*(p->nColumn+1)*2); + aSzIns = &aSzDel[p->nColumn+1]; + memset(aSzDel, 0, sizeof(aSzDel[0])*(p->nColumn+1)*2); /* If this is an INSERT operation, or an UPDATE that modifies the rowid ** value, then this operation requires constraint handling. @@ -3167,7 +5298,7 @@ int sqlite3Fts3UpdateMethod( ** detect the conflict and return SQLITE_CONSTRAINT before beginning to ** modify the database file. */ - if( nArg>1 ){ + if( nArg>1 && p->zContentTbl==0 ){ /* Find the value object that holds the new rowid value. */ sqlite3_value *pNewRowid = apVal[3+p->nColumn]; if( sqlite3_value_type(pNewRowid)==SQLITE_NULL ){ @@ -3212,20 +5343,23 @@ int sqlite3Fts3UpdateMethod( assert( sqlite3_value_type(apVal[0])==SQLITE_INTEGER ); rc = fts3DeleteByRowid(p, apVal[0], &nChng, aSzDel); isRemove = 1; - iRemove = sqlite3_value_int64(apVal[0]); } /* If this is an INSERT or UPDATE operation, insert the new record. */ if( nArg>1 && rc==SQLITE_OK ){ + int iLangid = sqlite3_value_int(apVal[2 + p->nColumn + 2]); if( bInsertDone==0 ){ rc = fts3InsertData(p, apVal, pRowid); - if( rc==SQLITE_CONSTRAINT ) rc = SQLITE_CORRUPT_VTAB; + if( rc==SQLITE_CONSTRAINT && p->zContentTbl==0 ){ + rc = FTS_CORRUPT_VTAB; + } } - if( rc==SQLITE_OK && (!isRemove || *pRowid!=iRemove) ){ - rc = fts3PendingTermsDocid(p, *pRowid); + if( rc==SQLITE_OK && (!isRemove || *pRowid!=p->iPrevDocid ) ){ + rc = fts3PendingTermsDocid(p, iLangid, *pRowid); } if( rc==SQLITE_OK ){ - rc = fts3InsertTerms(p, apVal, aSzIns); + assert( p->iPrevDocid==*pRowid ); + rc = fts3InsertTerms(p, iLangid, apVal, aSzIns); } if( p->bHasDocsize ){ fts3InsertDocsize(&rc, p, aSzIns); @@ -3233,12 +5367,12 @@ int sqlite3Fts3UpdateMethod( nChng++; } - if( p->bHasStat ){ + if( p->bFts4 ){ fts3UpdateDocTotals(&rc, p, aSzIns, aSzDel, nChng); } update_out: - sqlite3_free(aSzIns); + sqlite3_free(aSzDel); sqlite3Fts3SegmentsClose(p); return rc; } |