上一篇介绍了索引创建流程,本篇继续学习并理解查询关键词的逻辑处理流程。索引创建流程可参考:https://cloud.tencent.com/developer/article/1150671
索引查询主要源码是在searchd.cpp文件中。
// --- excerpt (searchd.cpp): registering one local index from the config ---
// NOTE(review): fragment of a larger function; szIndexName, hIndex and the
// ADD_* return codes are defined in the elided surrounding code.
ServedDesc_t tIdx;
// check path: an index without a 'path' config key cannot be served
if ( !hIndex.Exists ( "path" ) )
{
sphWarning ( "index '%s': key 'path' not found - NOT SERVING", szIndexName );
return ADD_ERROR;
}
// check name: refuse duplicates in the local-index hash
if ( g_pLocalIndexes->Exists ( szIndexName ) )
{
sphWarning ( "index '%s': duplicate name - NOT SERVING", szIndexName );
return ADD_ERROR;
}
// configure memlocking, expand_keywords, preopen etc from the config section
ConfigureLocalIndex ( tIdx, hIndex );
// try to create index
tIdx.m_sIndexPath = hIndex["path"].strval();
PreCreatePlainIndex ( tIdx, szIndexName );
tIdx.m_pIndex->SetCacheSize ( g_iMaxCachedDocs, g_iMaxCachedHits );
// compute the index "mass" (relative access-speed weight) from its status
CSphIndexStatus tStatus;
tIdx.m_pIndex->GetStatus ( &tStatus );
tIdx.m_iMass = CalculateMass ( tStatus );
// done: hand the descriptor over to the served-index hash
if ( !g_pLocalIndexes->Add ( tIdx, szIndexName ) )
{
sphWarning ( "INTERNAL ERROR: index '%s': hash add failed - NOT SERVING", szIndexName );
return ADD_ERROR;
}
// leak pointer, so its destructor won't delete it
// (the copy stored by Add() above keeps the live pointer)
tIdx.m_pIndex = NULL;
return ADD_LOCAL;
/// Descriptor for one served local index: the index object itself plus the
/// per-index serving flags parsed from its config section
/// (see ConfigureLocalIndex for which config key sets which flag).
struct ServedDesc_t
{
CSphIndex * m_pIndex; ///< the index object; NULLed out once ownership moves to the served-index hash
CSphString m_sIndexPath; ///< current index path (the 'path' config key)
CSphString m_sNewPath;
bool m_bEnabled; ///< to disable index in cases when rotation fails
bool m_bMlock; ///< set from the 'mlock' config key
bool m_bPreopen; ///< set from the 'preopen' config key
bool m_bExpand; ///< set from the 'expand_keywords' config key
bool m_bToDelete;
bool m_bOnlyNew;
bool m_bRT;
CSphString m_sGlobalIDFPath; ///< set from the 'global_idf' config key
bool m_bOnDiskAttrs; ///< set from 'ondisk_attrs' (numeric form) or the global override
bool m_bOnDiskPools; ///< set from 'ondisk_attrs = pool' or the global override
int64_t m_iMass; // relative weight (by access speed) of the index
};
/// Fill the per-index runtime flags of tIdx from its config section.
/// Daemon-wide overrides (g_bOptNoLock, g_bOnDiskAttrs, g_bOnDiskPools)
/// are applied on top of the per-index settings.
void ConfigureLocalIndex ( ServedDesc_t & tIdx, const CSphConfigSection & hIndex )
{
	// mlock is honored only when locking was not globally disabled
	const bool bWantMlock = ( hIndex.GetInt ( "mlock", 0 )!=0 );
	tIdx.m_bMlock = bWantMlock && !g_bOptNoLock;

	tIdx.m_bExpand = hIndex.GetInt ( "expand_keywords", 0 )!=0;
	tIdx.m_bPreopen = hIndex.GetInt ( "preopen", 0 )!=0;
	tIdx.m_sGlobalIDFPath = hIndex.GetStr ( "global_idf" );

	// 'ondisk_attrs' accepts either a numeric flag (plain on-disk attrs)
	// or the string "pool" (on-disk pools); globals OR in on top
	const char * sOnDisk = hIndex.GetStr ( "ondisk_attrs", "" );
	tIdx.m_bOnDiskAttrs = ( hIndex.GetInt ( "ondisk_attrs", 0 )==1 ) || g_bOnDiskAttrs;
	tIdx.m_bOnDiskPools = ( strcmp ( sOnDisk, "pool" )==0 ) || g_bOnDiskPools;
}
/// Allocate the plain (disk) index object for tServed and push the
/// configured settings down into it. The descriptor starts out disabled;
/// it is enabled elsewhere once the index is actually ready to serve.
void PreCreatePlainIndex ( ServedDesc_t & tServed, const char * sName )
{
	CSphIndex * pIndex = sphCreateIndexPhrase ( sName, tServed.m_sIndexPath.cstr() );

	// propagate per-index and daemon-wide settings into the index object
	pIndex->m_bExpandKeywords = tServed.m_bExpand;
	pIndex->m_iExpansionLimit = g_iExpansionLimit;
	pIndex->SetPreopen ( tServed.m_bPreopen || g_bPreopenIndexes );
	pIndex->SetGlobalIDFPath ( tServed.m_sGlobalIDFPath );
	pIndex->SetMemorySettings ( tServed.m_bMlock, tServed.m_bOnDiskAttrs, tServed.m_bOnDiskPools );

	tServed.m_pIndex = pIndex;
	tServed.m_bEnabled = false;
}
/// Factory for the standard on-disk index implementation (CSphIndex_VLN).
CSphIndex * sphCreateIndexPhrase ( const char* szIndexName, const char * sFilename )
{
	CSphIndex * pIndex = new CSphIndex_VLN ( szIndexName, sFilename );
	return pIndex;
}
/// this gets called for every new physical index
/// that is, local and RT indexes, but not distributed ones
/// @return false (with a warning logged) when preallocation fails, true otherwise
bool PrereadNewIndex ( ServedDesc_t & tIdx, const CSphConfigSection & hIndex, const char * szIndexName )
{
	bool bOk = tIdx.m_pIndex->Prealloc ( g_bStripPath );
	if ( !bOk )
	{
		sphWarning ( "index '%s': prealloc: %s; NOT SERVING", szIndexName, tIdx.m_pIndex->GetLastError().cstr() );
		return false;
	}
	// FIX: the function is declared bool but the original fell off the end
	// without returning a value on the success path (undefined behavior);
	// report success explicitly
	return true;
}
// --- excerpt (CSphIndex_VLN::Prealloc): map the index files into memory ---
// NOTE(review): fragment; the enclosing function header and some branches
// are elided, which is why stray closing braces appear below.
CSphEmbeddedFiles tEmbeddedFiles;
// preload schema
if ( !LoadHeader ( GetIndexFileName("sph").cstr(), bStripPath, tEmbeddedFiles, m_sLastWarning ) )
return false;
tEmbeddedFiles.Reset();
// preopen: keep the doclist/hitlist files open for the daemon's lifetime
if ( m_bKeepFilesOpen )
{
if ( m_tDoclistFile.Open ( GetIndexFileName("spd"), SPH_O_READ, m_sLastError ) < 0 )
return false;
// pre-v3 indexes store hits inside .spd; later formats use a separate .spp
if ( m_tHitlistFile.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, m_sLastError ) < 0 )
return false;
}
/////////////////////
// prealloc wordlist
/////////////////////
// might be no dictionary at this point for old index format
bool bWordDict = m_pDict && m_pDict->GetSettings().m_bWordDict;
// only checkpoint and wordlist infixes are actually read here; dictionary itself is just mapped
if ( !m_tWordlist.Preread ( GetIndexFileName("spi").cstr() , m_uVersion, bWordDict, m_sLastError ) )
return false;
}
if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !m_bIsEmpty )
{
/////////////
// attr data
/////////////
// one docinfo row = docid + the attribute values
int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
if ( !m_tAttr.Setup ( GetIndexFileName("spa").cstr(), m_sLastError, true ) )
return false;
int64_t iDocinfoSize = m_tAttr.GetLengthBytes();
if ( iDocinfoSize<0 )
return false;
// convert byte length into DWORD count
iDocinfoSize = iDocinfoSize / sizeof(DWORD);
int64_t iRealDocinfoSize = m_iMinMaxIndex ? m_iMinMaxIndex : iDocinfoSize;
m_iDocinfo = iRealDocinfoSize / iStride;
// NOTE(review): the .spa tail past m_iMinMaxIndex appears to hold the
// per-block min/max index (2 rows per block) — verify against the writer
m_iDocinfoIndex = ( ( iDocinfoSize - iRealDocinfoSize ) / iStride / 2 ) - 1;
m_pDocinfoIndex = m_tAttr.GetWritePtr() + m_iMinMaxIndex;
// prealloc docinfo hash but only if docinfo is big enough (in other words if hash is 8x+ less in size)
if ( m_tAttr.GetLengthBytes() > ( 32 << DOCINFO_HASH_BITS ) && !m_bDebugCheck )
{
if ( !m_tDocinfoHash.Alloc ( ( 1 << DOCINFO_HASH_BITS )+4, m_sLastError ) )
return false;
}
// MVA data
if ( m_uVersion>=4 )
{
if ( !m_tMva.Setup ( GetIndexFileName("spm").cstr(), m_sLastError, false ) )
return false;
// more than INT_MAX MVA entries: still readable, but updates are disabled
if ( m_tMva.GetNumEntries()>INT_MAX )
{
m_bArenaProhibit = true;
sphWarning ( "MVA update disabled (loaded MVA " INT64_FMT ", should be less %d)", m_tMva.GetNumEntries(), INT_MAX );
}
}
// string attribute data (.sps), format v17+
if ( m_uVersion>=17 && !m_tString.Setup ( GetIndexFileName("sps").cstr(), m_sLastError, true ) )
return false;
}
// prealloc killlist
if ( m_uVersion>=10 )
{
// FIXME!!! m_bId32to64
if ( !m_tKillList.Setup ( GetIndexFileName("spk").cstr(), m_sLastError, false ) )
return false;
}
// prealloc skiplist
if ( !m_bDebugCheck && m_bHaveSkips && !m_tSkiplists.Setup ( GetIndexFileName("spe").cstr(), m_sLastError, false ) )
return false;
// almost done
m_bPassedAlloc = true;
m_iIndexTag = ++m_iIndexTagSeq;
/// one regular query vs many sorters
// NOTE(review): excerpt of CSphIndex_VLN::MultiQuery; the opening brace and
// earlier body (pProfile / pDict / sModifiedQuery / dSorters setup) are elided.
bool CSphIndex_VLN::MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult,
int iSorters, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const
// parse the raw query text into an extended-query (XQ) node tree
XQQuery_t tParsed;
if ( !sphParseExtendedQuery ( tParsed, (const char*)sModifiedQuery, pQuery, m_pQueryTokenizer, &m_tSchema, pDict, m_tSettings ) )
{
// FIXME? might wanna reset profile to unknown state
pResult->m_sError = tParsed.m_sParseError;
return false;
}
// surface parser warnings to the caller via the query result
if ( !tParsed.m_sParseWarning.IsEmpty() )
pResult->m_sWarning = tParsed.m_sParseWarning;
// transform query if needed (quorum transform, etc.)
if ( pProfile )
pProfile->Switch ( SPH_QSTATE_TRANSFORMS );
sphTransformExtendedQuery ( &tParsed.m_pRoot, m_tSettings, pQuery->m_bSimplify, this );
// optional keyword expansion (the 'expand_keywords' index setting)
if ( m_bExpandKeywords )
{
tParsed.m_pRoot = sphQueryExpandKeywords ( tParsed.m_pRoot, m_tSettings );
tParsed.m_pRoot->Check ( true );
}
// this should be after keyword expansion
if ( m_tSettings.m_uAotFilterMask )
TransformAotFilter ( tParsed.m_pRoot, pDict->GetWordforms(), m_tSettings );
SphWordStatChecker_t tStatDiff;
tStatDiff.Set ( pResult->m_hWordStats );
// expanding prefix in word dictionary case
CSphScopedPayload tPayloads;
XQNode_t * pPrefixed = ExpandPrefix ( tParsed.m_pRoot, pResult, &tPayloads, pQuery->m_uDebugFlags );
if ( !pPrefixed )
return false;
tParsed.m_pRoot = pPrefixed;
// reject the query if the expanded tree grew too tall
if ( !sphCheckQueryHeight ( tParsed.m_pRoot, pResult->m_sError ) )
return false;
// flag common subtrees so they can be evaluated once and cached
int iCommonSubtrees = 0;
if ( m_iMaxCachedDocs && m_iMaxCachedHits )
iCommonSubtrees = sphMarkCommonSubtrees ( 1, &tParsed );
tParsed.m_bNeedSZlist = pQuery->m_bZSlist;
// evaluate the parsed tree; marked subtrees go through tNodeCache
CSphQueryNodeCache tNodeCache ( iCommonSubtrees, m_iMaxCachedDocs, m_iMaxCachedHits );
bool bResult = ParsedMultiQuery ( pQuery, pResult, iSorters, &dSorters[0], tParsed, pDict, tArgs, &tNodeCache, tStatDiff );
// --- excerpt (ParsedMultiQuery): open index files, set up term readers, ---
// --- and create the ranker                                              ---
// open files
CSphAutofile tDoclist, tHitlist;
if ( !m_bKeepFilesOpen )
{
if ( pProfile )
pProfile->Switch ( SPH_QSTATE_OPEN );
if ( tDoclist.Open ( GetIndexFileName("spd"), SPH_O_READ, pResult->m_sError ) < 0 )
return false;
// pre-v3 indexes store hits inside .spd; later formats use a separate .spp
if ( tHitlist.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, pResult->m_sError ) < 0 )
return false;
}
// setup search terms: reuse preopened files when available, else the local ones
DiskIndexQwordSetup_c tTermSetup ( m_bKeepFilesOpen ? m_tDoclistFile : tDoclist,
m_bKeepFilesOpen ? m_tHitlistFile : tHitlist,
m_tSkiplists.GetWritePtr(), pProfile );
tTermSetup.m_pDict = pDict;
tTermSetup.m_pIndex = this;
tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
tTermSetup.m_uMinDocid = m_uMinDocid;
if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
{
tTermSetup.m_iInlineRowitems = m_tSchema.GetRowSize();
tTermSetup.m_pMinRow = m_dMinRow.Begin();
}
tTermSetup.m_iDynamicRowitems = ppSorters[iMaxSchemaIndex]->GetSchema().GetDynamicSize();
if ( pQuery->m_uMaxQueryMsec>0 )
tTermSetup.m_iMaxTimer = sphMicroTimer() + pQuery->m_uMaxQueryMsec*1000; // max_query_time
tTermSetup.m_pWarning = &pResult->m_sWarning;
tTermSetup.m_bSetupReaders = true;
tTermSetup.m_pCtx = &tCtx;
tTermSetup.m_pNodeCache = pNodeCache;
// setup prediction constrain
CSphQueryStats tQueryStats;
bool bCollectPredictionCounters = ( pQuery->m_iMaxPredictedMsec>0 );
int64_t iNanoBudget = (int64_t)(pQuery->m_iMaxPredictedMsec) * 1000000; // from milliseconds to nanoseconds
tQueryStats.m_pNanoBudget = &iNanoBudget;
if ( bCollectPredictionCounters )
tTermSetup.m_pStats = &tQueryStats;
// setup query
// must happen before index-level reject, in order to build proper keyword stats
CSphScopedPtr<ISphRanker> pRanker ( sphCreateRanker ( tXQ, pQuery, pResult, tTermSetup, tCtx, ppSorters[iMaxSchemaIndex]->GetSchema() ) );
if ( !pRanker.Ptr() )
return false;
// --- excerpt (sphCreateRanker): one of the ranker dispatch cases ---
case SPH_RANK_MATCHANY: pRanker = new ExtRanker_T < RankerState_MatchAny_fn > ( tXQ, tTermSetup ); break;
/// Ranker base constructor: size the reusable match buffers for the
/// query's dynamic row width, then build the evaluation node tree.
ExtRanker_c::ExtRanker_c ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup )
	: m_dZoneInfo ( 0 )
{
	assert ( tSetup.m_pCtx );

	m_iInlineRowitems = tSetup.m_iInlineRowitems;

	// every match slot gets the same dynamic row size
	const int iDynRow = tSetup.m_iDynamicRowitems;
	for ( int iSlot=0; iSlot<ExtNode_i::MAX_DOCS; iSlot++ )
	{
		m_dMatches[iSlot].Reset ( iDynRow );
		m_dMyMatches[iSlot].Reset ( iDynRow );
	}
	m_tTestMatch.Reset ( iDynRow );

	assert ( tXQ.m_pRoot );

	// install this ranker as the zone checker before the node tree is
	// built (order preserved from the original)
	tSetup.m_pZoneChecker = this;
	m_pRoot = ExtNode_i::Create ( tXQ.m_pRoot, tSetup );
}
/// Recursively build the evaluation node tree for one parsed query node.
// NOTE(review): excerpt — iChildren and the leaf/special-node handling that
// precede this loop are defined in elided code; only the generic
// child-combining path is shown here.
ExtNode_i * ExtNode_i::Create ( const XQNode_t * pNode, const ISphQwordSetup & tSetup )
{
// generic create
ExtNode_i * pCur = NULL;
for ( int i=0; i<iChildren; i++ )
{
ExtNode_i * pNext = ExtNode_i::Create ( pNode->m_dChildren[i], tSetup );
if ( !pNext ) continue;
// first surviving child becomes the accumulator
if ( !pCur )
{
pCur = pNext;
continue;
}
// fold each further child into the accumulator, left to right, using
// the operator stored on the parent query node
switch ( pNode->GetOp() )
{
case SPH_QUERY_OR: pCur = new ExtOr_c ( pCur, pNext, tSetup ); break;
case SPH_QUERY_MAYBE: pCur = new ExtMaybe_c ( pCur, pNext, tSetup ); break;
case SPH_QUERY_AND: pCur = new ExtAnd_c ( pCur, pNext, tSetup ); break;
case SPH_QUERY_ANDNOT: pCur = new ExtAndNot_c ( pCur, pNext, tSetup ); break;
case SPH_QUERY_SENTENCE: pCur = new ExtUnit_c ( pCur, pNext, pNode->m_dSpec.m_dFieldMask, tSetup, MAGIC_WORD_SENTENCE ); break;
case SPH_QUERY_PARAGRAPH: pCur = new ExtUnit_c ( pCur, pNext, pNode->m_dSpec.m_dFieldMask, tSetup, MAGIC_WORD_PARAGRAPH ); break;
default: assert ( 0 && "internal error: unhandled op in ExtNode_i::Create()" ); break;
}
}
// nodes flagged as common subtrees (GetCount()>0) are wrapped in a caching
// proxy — presumably those marked by sphMarkCommonSubtrees; verify
if ( pCur && pNode->GetCount() )
return tSetup.m_pNodeCache->CreateProxy ( pCur, pNode, tSetup );
return pCur;
}
/// Build a node for a single keyword: spawn the query word, then wrap it.
ExtNode_i * ExtNode_i::Create ( const XQKeyword_t & tWord, const XQNode_t * pNode, const ISphQwordSetup & tSetup )
{
	ISphQword * pQword = CreateQueryWord ( tWord, tSetup );
	return Create ( pQword, pNode, tSetup );
}
/// Build an ISphQword for one parsed keyword: compute its dictionary id,
/// copy the per-keyword attributes, and let the setup object finish it.
/// pZonesDict, when given, overrides the setup's dictionary.
static ISphQword * CreateQueryWord ( const XQKeyword_t & tWord, const ISphQwordSetup & tSetup, CSphDict * pZonesDict=NULL )
{
	// work on a bounded, NUL-terminated local copy of the keyword text
	BYTE sTmp [ 3*SPH_MAX_WORD_LEN + 16 ];
	strncpy ( (char*)sTmp, tWord.m_sWord.cstr(), sizeof(sTmp) );
	sTmp[sizeof(sTmp)-1] = '\0';

	ISphQword * pWord = tSetup.QwordSpawn ( tWord );
	pWord->m_sWord = tWord.m_sWord;

	CSphDict * pDict = pZonesDict ? pZonesDict : tSetup.m_pDict;
	if ( tWord.m_bMorphed )
		pWord->m_uWordID = pDict->GetWordIDNonStemmed ( sTmp );
	else
		pWord->m_uWordID = pDict->GetWordID ( sTmp );
	pWord->m_sDictWord = (char*)sTmp;
	pWord->m_bExpanded = tWord.m_bExpanded;
	tSetup.QwordSetup ( pWord );

	// collapse the field-start/field-end flags into a single term position
	if ( tWord.m_bFieldStart )
		pWord->m_iTermPos = tWord.m_bFieldEnd ? TERM_POS_FIELD_STARTEND : TERM_POS_FIELD_START;
	else
		pWord->m_iTermPos = tWord.m_bFieldEnd ? TERM_POS_FIELD_END : TERM_POS_NONE;

	pWord->m_fBoost = tWord.m_fBoost;
	pWord->m_iAtomPos = tWord.m_iAtomPos;
	return pWord;
}
/// Per-word setup entry point: seed the word's document cursor with this
/// index's dynamic row width and minimal docid, then run the full setup.
/// @return whatever the full Setup() pass reports
bool DiskIndexQwordSetup_c::QwordSetup ( ISphQword * pWord ) const
{
	// assumes pWord was spawned by this setup (QwordSpawn) and is therefore
	// a DiskIndexQwordTraits_c; the original used a C-style cast here
	DiskIndexQwordTraits_c * pMyWord = static_cast<DiskIndexQwordTraits_c*> ( pWord );

	// setup attrs
	pMyWord->m_tDoc.Reset ( m_iDynamicRowitems );
	pMyWord->m_iMinID = m_uMinDocid;
	pMyWord->m_tDoc.m_uDocID = m_uMinDocid;

	return pMyWord->Setup ( this );
}
/// Look up one keyword in the on-disk dictionary and prime its list readers.
// NOTE(review): excerpt — the declarations of pIndex, sWord, iWordLen, tWord
// and bWordDict, plus the final success return, are elided from this fragment.
bool DiskIndexQwordSetup_c::Setup ( ISphQword * pWord ) const
{
// find the dictionary checkpoint (block) that may contain the word
const CSphWordlistCheckpoint * pCheckpoint = pIndex->m_tWordlist.FindCheckpoint ( sWord, iWordLen, tWord.m_uWordID, false );
if ( !pCheckpoint )
return false;
// decode wordlist chunk
const BYTE * pBuf = pIndex->m_tWordlist.AcquireDict ( pCheckpoint );
assert ( pBuf );
CSphDictEntry tRes;
if ( bWordDict )
{
// dict=keywords: scan the block comparing keyword strings
KeywordsBlockReader_c tCtx ( pBuf, m_pSkips!=NULL );
while ( tCtx.UnpackWord() )
{
// block is sorted
// so once keywords are greater than the reference word, no more matches
assert ( tCtx.GetWordLen()>0 );
int iCmp = sphDictCmpStrictly ( sWord, iWordLen, tCtx.GetWord(), tCtx.GetWordLen() );
if ( iCmp<0 )
return false;
if ( iCmp==0 )
break;
}
if ( tCtx.GetWordLen()<=0 )
return false;
tRes = tCtx;
} else
{
// dict=crc: look the word up by its word id
if ( !pIndex->m_tWordlist.GetWord ( pBuf, tWord.m_uWordID, tRes ) )
return false;
}
// copy doc/hit statistics; hitless words are masked/flagged per hitless mode
const ESphHitless eMode = pIndex->m_tSettings.m_eHitless;
tWord.m_iDocs = eMode==SPH_HITLESS_SOME ? ( tRes.m_iDocs & HITLESS_DOC_MASK ) : tRes.m_iDocs;
tWord.m_iHits = tRes.m_iHits;
tWord.m_bHasHitlist =
( eMode==SPH_HITLESS_NONE ) ||
( eMode==SPH_HITLESS_SOME && !( tRes.m_iDocs & HITLESS_DOC_FLAG ) );
if ( m_bSetupReaders )
{
tWord.m_rdDoclist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
tWord.m_rdDoclist.SetFile ( m_tDoclist );
tWord.m_rdDoclist.m_pProfile = m_pProfile;
tWord.m_rdDoclist.m_eProfileState = SPH_QSTATE_READ_DOCS;
// read in skiplist
// OPTIMIZE? maybe cache hot decompressed lists?
// OPTIMIZE? maybe add an option to decompress on preload instead?
if ( m_pSkips && tRes.m_iDocs>SPH_SKIPLIST_BLOCK )
{
// unpack the delta-encoded skiplist into absolute entries; each entry
// accumulates onto the previous one (first entry is the list start)
const BYTE * pSkip = m_pSkips + tRes.m_iSkiplistOffset;
tWord.m_dSkiplist.Add();
tWord.m_dSkiplist.Last().m_iBaseDocid = 0;
tWord.m_dSkiplist.Last().m_iOffset = tRes.m_iDoclistOffset;
tWord.m_dSkiplist.Last().m_iBaseHitlistPos = 0;
for ( int i=1; i<( tWord.m_iDocs/SPH_SKIPLIST_BLOCK ); i++ )
{
SkiplistEntry_t & t = tWord.m_dSkiplist.Add();
SkiplistEntry_t & p = tWord.m_dSkiplist [ tWord.m_dSkiplist.GetLength()-2 ];
t.m_iBaseDocid = p.m_iBaseDocid + SPH_SKIPLIST_BLOCK + (SphDocID_t) sphUnzipOffset ( pSkip );
t.m_iOffset = p.m_iOffset + 4*SPH_SKIPLIST_BLOCK + sphUnzipOffset ( pSkip );
t.m_iBaseHitlistPos = p.m_iBaseHitlistPos + sphUnzipOffset ( pSkip );
}
}
// position the doclist reader at this word's document list
tWord.m_rdDoclist.SeekTo ( tRes.m_iDoclistOffset, tRes.m_iDoclistHint );
tWord.m_rdHitlist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
tWord.m_rdHitlist.SetFile ( m_tHitlist );
tWord.m_rdHitlist.m_pProfile = m_pProfile;
tWord.m_rdHitlist.m_eProfileState = SPH_QSTATE_READ_HITS;
}
}
/// dictionary entry
/// some of the fields might be unused depending on specific dictionary type
/// (dict=crc identifies words by m_uWordID; dict=keywords by m_sKeyword)
struct CSphDictEntry
{
SphWordID_t m_uWordID; ///< keyword id (for dict=crc)
const BYTE * m_sKeyword; ///< keyword text (for dict=keywords)
int m_iDocs; ///< number of matching documents
int m_iHits; ///< number of occurrences
SphOffset_t m_iDoclistOffset; ///< absolute document list offset (into .spd)
SphOffset_t m_iDoclistLength; ///< document list length in bytes
SphOffset_t m_iSkiplistOffset; ///< absolute skiplist offset (into .spe)
int m_iDoclistHint; ///< raw document list length hint value (0..255 range, 1 byte)
};
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。