/// <summary>
/// Runs the Match1 query over <c>_WordIndexes</c> and returns the scored documents.
/// </summary>
/// <returns>
/// The scored result set. When there is nothing to search, this is <c>UpDict</c> for a
/// negated query that has an upstream dictionary, otherwise an empty dictionary.
/// </returns>
public Core.SFQL.Parse.DocumentResultWhereDictionary Search()
{
    Query.PerformanceReport report = new Hubble.Core.Query.PerformanceReport("Search of Match1");
    Core.SFQL.Parse.DocumentResultWhereDictionary matches = new Core.SFQL.Parse.DocumentResultWhereDictionary();

    // Nothing to look up: hand back the upstream set for NOT queries, else the empty set.
    bool nothingToSearch = _QueryWords.Count <= 0 || _WordIndexes.Length <= 0;
    if (nothingToSearch)
    {
        bool passThrough = _QueryParameter.Not && UpDict != null;
        return passThrough ? UpDict : matches;
    }

    // A negated query is calculated without the upstream filter; the upstream
    // dictionary is merged in afterwards through AndMergeForNot.
    Core.SFQL.Parse.DocumentResultWhereDictionary upstream = this._QueryParameter.Not ? null : this.UpDict;

    if (_InvertedIndex.IndexMode == Field.IndexMode.Simple)
    {
        Calculate(upstream, ref matches, _WordIndexes);
    }
    else
    {
        CalculateWithPosition(upstream, ref matches, _WordIndexes);
    }

    if (this._QueryParameter.Not)
    {
        matches.Not = true;

        if (UpDict != null)
        {
            matches = matches.AndMergeForNot(matches, UpDict);
        }
    }

    report.Stop();
    return matches;
}
/// <summary>
/// Dispatches to the optimized calculation that matches the requested ordering.
/// </summary>
/// <param name="upDict">Upstream result dictionary, forwarded unchanged.</param>
/// <param name="docIdRank">Result dictionary, forwarded by reference.</param>
unsafe public void CalculateOptimize(Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank)
{
    // Any ordering other than "score desc" takes the general path.
    if (!Argument.IsOrderByScoreDesc())
    {
        CalculateOptimizeNormalOrderBy(upDict, ref docIdRank);
        return;
    }

    CalculateOptimizeOrderByScoreDesc(upDict, ref docIdRank);
}
/// <summary>
/// Runs the Contains query: searches every part returned by
/// <c>GetAllPartOfWordIndexes</c> and OR-merges the per-part results.
/// </summary>
/// <returns>The merged result set, post-processed for negated queries.</returns>
public Core.SFQL.Parse.DocumentResultWhereDictionary Search()
{
    Query.PerformanceReport report = new Hubble.Core.Query.PerformanceReport("Search of Contains");

    List<WordIndexForQuery[]> parts = GetAllPartOfWordIndexes();

    if (_QueryWords.Count <= 0 || parts.Count <= 0)
    {
        // Let PartSearch produce its "nothing to search" result.
        return PartSearch(new WordIndexForQuery[0]);
    }

    Core.SFQL.Parse.DocumentResultWhereDictionary merged = PartSearch(parts[0]);

    int partNo = 1;
    while (partNo < parts.Count)
    {
        WordIndexForQuery[] part = parts[partNo];
        bool everyWordHasHits = true;

        // Word indexes are reset one by one until the first word with a zero
        // WordCount is met; such a part is not searched at all.
        foreach (WordIndexForQuery word in part)
        {
            if (word.WordIndex.WordCount == 0)
            {
                everyWordHasHits = false;
                break;
            }

            word.WordIndex.Reset();
        }

        if (everyWordHasHits)
        {
            merged.OrMerge(merged, PartSearch(part));
        }

        partNo++;
    }

    if (this._QueryParameter.Not)
    {
        merged.Not = true;

        if (UpDict != null)
        {
            merged = merged.AndMergeForNot(merged, UpDict);
        }
    }

    report.Stop();
    return merged;
}
/// <summary>
/// AND-merges two result dictionaries: the result holds only the documents whose id
/// is present in both. Written for the case where neither input has Not set.
/// </summary>
/// <param name="fst">First dictionary; a null value yields an empty result.</param>
/// <param name="sec">Second dictionary; a null value yields an empty result.</param>
/// <returns>
/// A new dictionary with the common documents. Its Not flag is copied from whichever
/// input is iterated against (the one with more entries, or <paramref name="sec"/> on a tie).
/// </returns>
unsafe public Core.SFQL.Parse.DocumentResultWhereDictionary AndMergeDict(Core.SFQL.Parse.DocumentResultWhereDictionary fst, Core.SFQL.Parse.DocumentResultWhereDictionary sec)
{
    if (fst == null || sec == null)
    {
        return new Core.SFQL.Parse.DocumentResultWhereDictionary();
    }

    // Walk the smaller dictionary and probe the larger one.
    bool firstIsLarger = fst.Count > sec.Count;
    Core.SFQL.Parse.DocumentResultWhereDictionary smaller = firstIsLarger ? sec : fst;
    Core.SFQL.Parse.DocumentResultWhereDictionary larger = firstIsLarger ? fst : sec;

    Core.SFQL.Parse.DocumentResultWhereDictionary intersection = new DocumentResultWhereDictionary();
    intersection.Not = larger.Not;

    foreach (Core.SFQL.Parse.DocumentResultPoint point in smaller.Values)
    {
        Query.DocumentResult *match;

        if (!larger.TryGetValue(point.pDocumentResult->DocId, out match))
        {
            continue;
        }

        // The score is added through the pointer handed back by TryGetValue,
        // and the updated entry is then copied into the result.
        match->Score += point.pDocumentResult->Score;

        // Keep whichever side carries payload data.
        if (match->PayloadData == null && point.pDocumentResult->PayloadData != null)
        {
            match->PayloadData = point.pDocumentResult->PayloadData;
        }

        intersection.Add(point.pDocumentResult->DocId, *match);
    }

    return intersection;
}
/// <summary>
/// Searches one group of word indexes and returns the scored documents.
/// </summary>
/// <param name="wordIndexes">The word indexes that make up this part of the query.</param>
/// <returns>
/// The scored result set. When there is nothing to search, this is <c>UpDict</c> for a
/// negated query that has an upstream dictionary, otherwise an empty dictionary.
/// </returns>
private Core.SFQL.Parse.DocumentResultWhereDictionary PartSearch(WordIndexForQuery[] wordIndexes)
{
    Core.SFQL.Parse.DocumentResultWhereDictionary hits = new Core.SFQL.Parse.DocumentResultWhereDictionary();

    if (_QueryWords.Count <= 0 || wordIndexes.Length <= 0)
    {
        bool passThrough = _QueryParameter.Not && UpDict != null;
        return passThrough ? UpDict : hits;
    }

    // Negated queries are calculated without the upstream filter.
    Core.SFQL.Parse.DocumentResultWhereDictionary upstream = this._QueryParameter.Not ? null : this.UpDict;

    if (_InvertedIndex.IndexMode == Field.IndexMode.Simple)
    {
        Calculate(upstream, ref hits, wordIndexes);
    }
    else
    {
        CalculateWithPosition(upstream, ref hits, wordIndexes);
    }

    return hits;
}
/// <summary>
/// Scores the documents that contain every word in <paramref name="wordIndexes"/>
/// (no position weighting) and adds them to <paramref name="docIdRank"/>.
/// </summary>
/// <param name="upDict">
/// Optional upstream result set. When non-null it filters the added documents:
/// with Not unset only ids present in it are kept, with Not set only ids absent from it.
/// With Not unset its scores are also added onto the matching entries.
/// </param>
/// <param name="docIdRank">
/// Receives the scored documents. May be replaced by a larger pre-sized dictionary
/// when it is empty on entry.
/// </param>
/// <param name="wordIndexes">Word indexes of the query; sorted in place by this method.</param>
unsafe private void Calculate(DocumentResultWhereDictionary upDict, ref DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    // Note: this reorders the caller's array.
    Array.Sort(wordIndexes);
    MinResultCount = _DBProvider.Table.GroupByLimit;
    // Smallest doc-list size among the words, capped at 1M. Used only to pre-size docIdRank.
    int minWordDocListCount = 1 * 1024 * 1024; //1M
    foreach (WordIndexForQuery wifq in wordIndexes)
    {
        minWordDocListCount = Math.Min(minWordDocListCount, wifq.WordIndex.WordDocList.Count);
    }
    // Only swap in a pre-sized dictionary when the current one is still empty.
    if (docIdRank.Count == 0)
    {
        if (minWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
        {
            docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(minWordDocListCount);
        }
    }
    Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");
    // Single-word fast path: taken only for one word, when partial loading is allowed
    // and the query has no AND expression.
    bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.NoAndExpression && wordIndexes.Length == 1;
    int oneWordMaxCount = 0;
    if (oneWordOptimize)
    {
        // One word
        WordIndexForQuery wifq = wordIndexes[0]; //first word
        Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
        int j = 0;
        // A negative DocumentId ends the enumeration.
        while (docList.DocumentId >= 0)
        {
            // NOTE(review): drp is assigned but never read in this branch.
            Core.SFQL.Parse.DocumentResultPoint drp;
            drp.pDocumentResult = null;
            if (j > MinResultCount)
            {
                // Past the first MinResultCount + 1 documents: skip any document whose
                // word count is below the maximum seen in that leading window.
                if (oneWordMaxCount > docList.Count)
                {
                    j++;
                    docList = wifq.WordIndex.GetNext();
                    continue;
                }
            }
            else
            {
                // Still inside the leading window: track the largest word count.
                if (oneWordMaxCount < docList.Count)
                {
                    oneWordMaxCount = docList.Count;
                }
            }
            // Per-word score, computed in long arithmetic.
            // NOTE(review): integer division; throws if Sum_d_t or TotalWordsInThisDocument
            // is 0 -- presumably never the case, confirm against the index writer.
            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);
            if (score < 0)
            {
                // A negative value is treated as overflow and replaced by a large positive score.
                score = long.MaxValue - 4000000;
            }
            if (upDict == null)
            {
                docIdRank.Add(docList.DocumentId, score);
            }
            else
            {
                if (!upDict.Not)
                {
                    // Upstream is a positive set: keep only ids it contains.
                    if (upDict.ContainsKey(docList.DocumentId))
                    {
                        docIdRank.Add(docList.DocumentId, score);
                    }
                }
                else
                {
                    // Upstream is a negated set: keep only ids it does not contain.
                    if (!upDict.ContainsKey(docList.DocumentId))
                    {
                        docIdRank.Add(docList.DocumentId, score);
                    }
                }
            }
            j++;
            docList = wifq.WordIndex.GetNext();
        }
    }
    else
    {
        // General path: drive the scan from the first word (after sorting) and
        // look every candidate doc id up in the remaining words.
        int wordIndexesLen = wordIndexes.Length;
        WordIndexForQuery fstWifq = wordIndexes[0]; //first word
        Entity.DocumentPositionList fstDocList = fstWifq.WordIndex.GetNext();
        Entity.DocumentPositionList[] docListArr = new Hubble.Core.Entity.DocumentPositionList[wordIndexesLen];
        docListArr[0] = fstDocList;
        while (fstDocList.DocumentId >= 0)
        {
            int curWord = 1;
            int firstDocId = fstDocList.DocumentId;
            while (curWord < wordIndexesLen)
            {
                docListArr[curWord] = wordIndexes[curWord].WordIndex.Get(firstDocId);
                if (docListArr[curWord].DocumentId < 0)
                {
                    // This word has no entry for the document: not a match.
                    break;
                }
                curWord++;
            } //While
            if (curWord >= wordIndexesLen)
            {
                // Every word was found in this document: sum the per-word scores.
                long totalScore = 0;
                for (int i = 0; i < wordIndexesLen; i++)
                {
                    WordIndexForQuery wifq = wordIndexes[i];
                    Entity.DocumentPositionList docList = docListArr[i];
                    long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);
                    if (score < 0)
                    {
                        // Negative is treated as overflow.
                        score = long.MaxValue - 4000000;
                    }
                    totalScore += score;
                }
                // Same upstream filtering as in the single-word path.
                if (upDict == null)
                {
                    docIdRank.Add(firstDocId, totalScore);
                }
                else
                {
                    if (!upDict.Not)
                    {
                        if (upDict.ContainsKey(firstDocId))
                        {
                            docIdRank.Add(firstDocId, totalScore);
                        }
                    }
                    else
                    {
                        if (!upDict.ContainsKey(firstDocId))
                        {
                            docIdRank.Add(firstDocId, totalScore);
                        }
                    }
                }
            }
            fstDocList = fstWifq.WordIndex.GetNext();
            docListArr[0] = fstDocList;
        }
    }
    // Add the upstream scores onto the matching entries (positive upstream sets only).
    if (upDict != null)
    {
        if (!upDict.Not)
        {
            foreach (int docid in docIdRank.Keys)
            {
                DocumentResult *upDrp;
                if (upDict.TryGetValue(docid, out upDrp))
                {
                    DocumentResult *drpResult;
                    if (docIdRank.TryGetValue(docid, out drpResult))
                    {
                        drpResult->Score += upDrp->Score;
                    }
                }
            }
        }
    }
    // Drop deleted documents; Filter returns how many entries it removed.
    DeleteProvider delProvider = _DBProvider.DelProvider;
    int delCount = delProvider.Filter(docIdRank);
    // In the unfiltered single-word case the total comes from the word index,
    // otherwise it is simply the number of collected documents.
    if (oneWordOptimize && _QueryParameter.CanLoadPartOfDocs && upDict == null)
    {
        docIdRank.RelTotalCount = wordIndexes[0].RelTotalCount - delCount;
    }
    else
    {
        docIdRank.RelTotalCount = docIdRank.Count;
    }
    performanceReport.Stop();
}
/// <summary>
/// Scores documents for <paramref name="wordIndexes"/> with position weighting and adds
/// them to <paramref name="docIdRank"/>. Delegates to CalculateWithPositionMatch when
/// UseMatch says so, and to an IQueryOptimize implementation when the query parameters
/// allow it; otherwise does the scan itself.
/// </summary>
/// <param name="upDict">
/// Optional upstream result set. When non-null it filters the added documents:
/// with Not unset only ids present in it are kept, with Not set only ids absent from it.
/// With Not unset its scores are also added onto the matching entries.
/// </param>
/// <param name="docIdRank">
/// Receives the scored documents. May be replaced by a larger pre-sized dictionary
/// when it is empty on entry.
/// </param>
/// <param name="wordIndexes">Word indexes of the query; sorted in place by this method.</param>
unsafe private void CalculateWithPosition(Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    if (UseMatch(wordIndexes))
    {
        CalculateWithPositionMatch(upDict, ref docIdRank, wordIndexes);
        return;
    }
    // Note: both calls reorder the caller's array.
    Array.Sort(wordIndexes);
    AdjustSort(wordIndexes);
    MinResultCount = _DBProvider.Table.GroupByLimit;
    // Smallest word-index Count among the words, capped at 1M. Used only to pre-size docIdRank.
    int minWordDocListCount = 1 * 1024 * 1024; //1M
    foreach (WordIndexForQuery wifq in wordIndexes)
    {
        minWordDocListCount = Math.Min(minWordDocListCount, wifq.WordIndex.Count);
    }
    Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");
    // Single-word optimized path.
    bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.AndExpressionCanBeOptimized(_DBProvider) && wordIndexes.Length == 1 && _NotInDict == null && _QueryParameter.End >= 0 && !_QueryParameter.NeedDistinct;
    if (oneWordOptimize)
    {
        IQueryOptimize qOptimize = QueryOptimizeBuilder.Build(typeof(OneWordOptimize), DBProvider, _QueryParameter.End, _QueryParameter.OrderBy, _QueryParameter.OrderBys, _QueryParameter.NeedGroupBy, _QueryParameter.OrderByCanBeOptimized, _QueryParameter.NeedFilterUntokenizedConditions(this._DBProvider), _QueryParameter.UntokenizedTreeOnRoot, wordIndexes);
        try
        {
            qOptimize.CalculateOptimize(upDict, ref docIdRank);
            return;
        }
        finally
        {
            // Runs on both the normal return and an exception.
            performanceReport.Stop();
        }
    }
    // Multi-word optimized path: same conditions as above without the one-word requirement.
    if (this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.AndExpressionCanBeOptimized(_DBProvider) && _NotInDict == null && _QueryParameter.End >= 0 && !_QueryParameter.NeedDistinct)
    {
        IQueryOptimize qOptimize = QueryOptimizeBuilder.Build(typeof(ContainsOptimize), DBProvider, _QueryParameter.End, _QueryParameter.OrderBy, _QueryParameter.OrderBys, _QueryParameter.NeedGroupBy, _QueryParameter.OrderByCanBeOptimized, _QueryParameter.NeedFilterUntokenizedConditions(this._DBProvider), _QueryParameter.UntokenizedTreeOnRoot, wordIndexes);
        try
        {
            qOptimize.CalculateOptimize(upDict, ref docIdRank);
            return;
        }
        finally
        {
            performanceReport.Stop();
        }
    }
    // Only swap in a pre-sized dictionary when the current one is still empty.
    if (docIdRank.Count == 0)
    {
        if (minWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
        {
            docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(minWordDocListCount);
        }
    }
    {
        // Exponent of the proximity factor: 2 / (word count - 1), or 1 for a single word.
        double ratio = 1;
        if (wordIndexes.Length > 1)
        {
            ratio = (double)2 / (double)(wordIndexes.Length - 1);
        }
        int wordIndexesLen = wordIndexes.Length;
        // The scan is driven by the first word (after sorting).
        WordIndexForQuery fstWifq = wordIndexes[0]; //first word
        OriginalDocumentPositionList fstODPL = new OriginalDocumentPositionList();
        fstWifq.WordIndex.GetNextOriginal(ref fstODPL);
        Entity.DocumentPositionList[] docListArr = new Hubble.Core.Entity.DocumentPositionList[wordIndexesLen];
        fstODPL.ToDocumentPositionList(ref docListArr[0]);
        OriginalDocumentPositionList odpl = new OriginalDocumentPositionList();
        // A negative DocumentId ends the enumeration.
        while (fstODPL.DocumentId >= 0)
        {
            int curWord = 1;
            int firstDocId = fstODPL.DocumentId;
            while (curWord < wordIndexesLen)
            {
                wordIndexes[curWord].WordIndex.GetNextOriginalWithDocId(ref odpl, firstDocId);
                odpl.ToDocumentPositionList(ref docListArr[curWord]);
                if (docListArr[curWord].DocumentId < 0)
                {
                    // Word not found in this document: tolerated only when the word
                    // carries the Or flag, otherwise the document is rejected.
                    if ((wordIndexes[curWord].Flags & WordInfo.Flag.Or) != 0)
                    {
                        curWord++;
                        continue;
                    }
                    else
                    {
                        break;
                    }
                }
                curWord++;
            } //While
            if (curWord >= wordIndexesLen)
            {
                // Matched: accumulate the per-word scores, each scaled by a proximity factor.
                long totalScore = 0;
                Entity.DocumentPositionList lastDocList = new Hubble.Core.Entity.DocumentPositionList();
                for (int i = 0; i < wordIndexesLen; i++)
                {
                    WordIndexForQuery wifq = wordIndexes[i];
                    if (wifq.WordIndex.Count == 0)
                    {
                        // Original note (2010-09-30 eaglet): a^5000^0 b^5000^2^1 --
                        // the document has a but not b, and b is allowed to be an Or word.
                        // Such a word contributes nothing to the score.
                        continue;
                    }
                    Entity.DocumentPositionList docList = docListArr[i];
                    // NOTE(review): an Or word with a non-empty index that is missing from this
                    // document still reaches this division with the docList returned for the miss;
                    // confirm TotalWordsInThisDocument cannot be 0 there.
                    long score = (long)wifq.FieldRank * (long)wifq.WordRank * 
                    (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);
                    if (score < 0)
                    {
                        // A negative value is treated as overflow.
                        score = long.MaxValue - 4000000;
                    }
                    double delta = 1;
                    if (i > 0)
                    {
                        // Compare the distance between this word and the previous one in the
                        // query with the distance between their first positions in the document.
                        double queryPositionDelta = wifq.FirstPosition - wordIndexes[i - 1].FirstPosition;
                        double positionDelta = docList.FirstPosition - lastDocList.FirstPosition;
                        delta = Math.Abs(queryPositionDelta - positionDelta);
                        // Bucket the difference: ~0 -> 0.031, up to 1.1 -> 0.5, up to 2.1 -> 1,
                        // larger differences are used as they are.
                        if (delta < 0.031)
                        {
                            delta = 0.031;
                        }
                        else if (delta <= 1.1)
                        {
                            delta = 0.5;
                        }
                        else if (delta <= 2.1)
                        {
                            delta = 1;
                        }
                        // Smaller differences give a larger factor; it is also scaled by the
                        // occurrence counts in the document relative to the counts in the query.
                        delta = Math.Pow((1 / delta), ratio) * docList.Count * lastDocList.Count / (double)(wifq.QueryCount * wordIndexes[i - 1].QueryCount);
                    }
                    lastDocList = docList;
                    totalScore += (long)(score * delta);
                }
                // Documents listed in _NotInDict are excluded.
                bool notInDict = false;
                if (_NotInDict != null)
                {
                    if (_NotInDict.ContainsKey(firstDocId))
                    {
                        notInDict = true;
                    }
                }
                if (!notInDict)
                {
                    if (upDict == null)
                    {
                        docIdRank.Add(firstDocId, totalScore);
                    }
                    else
                    {
                        if (!upDict.Not)
                        {
                            // Upstream is a positive set: keep only ids it contains.
                            if (upDict.ContainsKey(firstDocId))
                            {
                                docIdRank.Add(firstDocId, totalScore);
                            }
                        }
                        else
                        {
                            // Upstream is a negated set: keep only ids it does not contain.
                            if (!upDict.ContainsKey(firstDocId))
                            {
                                docIdRank.Add(firstDocId, totalScore);
                            }
                        }
                    }
                }
            }//if (curWord >= wordIndexesLen)
            // Advance the driving word.
            fstWifq.WordIndex.GetNextOriginal(ref fstODPL);
            fstODPL.ToDocumentPositionList(ref docListArr[0]);
        }
    }
    // Add the upstream scores onto the matching entries (positive upstream sets only).
    if (upDict != null)
    {
        if (!upDict.Not)
        {
            foreach (int docid in docIdRank.Keys)
            {
                DocumentResult *upDrp;
                if (upDict.TryGetValue(docid, out upDrp))
                {
                    DocumentResult *drpResult;
                    if (docIdRank.TryGetValue(docid, out drpResult))
                    {
                        drpResult->Score += upDrp->Score;
                    }
                }
            }
        }
    }
    // Drop deleted documents; Filter returns how many entries it removed.
    DeleteProvider delProvider = _DBProvider.DelProvider;
    int delCount = delProvider.Filter(docIdRank);
    // NOTE(review): the oneWordOptimize block above always returns, so oneWordOptimize
    // is false whenever execution gets here and only the else branch can run.
    if (oneWordOptimize && _QueryParameter.CanLoadPartOfDocs && upDict == null)
    {
        docIdRank.RelTotalCount = wordIndexes[0].RelTotalCount - delCount;
    }
    else
    {
        docIdRank.RelTotalCount = docIdRank.Count;
    }
    performanceReport.Stop();
}
/// <summary>
/// Scores documents word by word with position weighting: a document found for any word
/// is added to <paramref name="docIdRank"/>, and a document found again for a later word
/// has its score increased and then rescaled by a proximity factor.
/// </summary>
/// <param name="upDict">
/// Optional upstream result set. When non-null it filters newly added documents:
/// with Not unset only ids present in it are kept, with Not set only ids absent from it.
/// With Not unset its scores are also added onto the matching entries.
/// </param>
/// <param name="docIdRank">
/// Receives the scored documents. May be replaced by a larger pre-sized dictionary
/// when it is empty on entry.
/// </param>
/// <param name="wordIndexes">Word indexes of the query; sorted in place by this method.</param>
unsafe private void CalculateWithPositionMatch(Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    // Note: this reorders the caller's array.
    Array.Sort(wordIndexes);
    MinResultCount = _DBProvider.Table.GroupByLimit;
    // Exponent of the proximity factor: 2 / (word count - 1), or 1 for a single word.
    double ratio = 1;
    if (wordIndexes.Length > 1)
    {
        ratio = (double)2 / (double)(wordIndexes.Length - 1);
    }
    // Capacity estimate: sum of all word-index counts plus 50%, capped at 1M.
    int maxWordDocListCount = 0;
    int documentSum = 0;
    foreach (WordIndexForQuery wifq in wordIndexes)
    {
        maxWordDocListCount += wifq.WordIndex.Count;
    }
    maxWordDocListCount += maxWordDocListCount / 2;
    if (maxWordDocListCount > 1024 * 1024)
    {
        maxWordDocListCount = 1024 * 1024;
    }
    // Only swap in a pre-sized dictionary when the current one is still empty.
    if (docIdRank.Count == 0)
    {
        if (maxWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
        {
            docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(maxWordDocListCount);
        }
    }
    Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");
    // Pruning is enabled only for one word, when partial loading is allowed
    // and the query has no AND expression.
    bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.NoAndExpression && wordIndexes.Length == 1;
    for (int i = 0; i < wordIndexes.Length; i++)
    {
        WordIndexForQuery wifq = wordIndexes[i];
        Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
        int j = 0;
        int oneWordMaxCount = 0;
        // A negative DocumentId ends the enumeration.
        while (docList.DocumentId >= 0)
        {
            Core.SFQL.Parse.DocumentResultPoint drp;
            drp.pDocumentResult = null;
            if (oneWordOptimize)
            {
                if (j > MinResultCount)
                {
                    // Past the first MinResultCount + 1 documents: skip any document whose
                    // word count is below the maximum seen in that leading window.
                    if (oneWordMaxCount > docList.Count)
                    {
                        docList = wifq.WordIndex.GetNext();
                        j++;
                        continue;
                    }
                }
                else
                {
                    // Still inside the leading window: track the largest word count.
                    if (oneWordMaxCount < docList.Count)
                    {
                        oneWordMaxCount = docList.Count;
                    }
                }
            }
            // Stop once more documents than the word's RelTotalCount have been visited.
            if (j > wifq.RelTotalCount)
            {
                break;
            }
            // Per-word score, computed in long arithmetic.
            // NOTE(review): integer division; throws if Sum_d_t or TotalWordsInThisDocument
            // is 0 -- presumably never the case, confirm against the index writer.
            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);
            if (score < 0)
            {
                // A negative value is treated as overflow.
                score = long.MaxValue - 4000000;
            }
            // drp.pDocumentResult was just set to null, so this starts out false;
            // the lookup below decides for every word after the first.
            bool exits = 
            drp.pDocumentResult != null;
            if (!exits && i > 0)
            {
                exits = docIdRank.TryGetValue(docList.DocumentId, out drp);
            }
            if (exits)
            {
                // Document already collected for an earlier word: add this word's score,
                // then rescale the running total by the proximity factor.
                drp.pDocumentResult->Score += score;
                double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
                double positionDelta = docList.FirstPosition - drp.pDocumentResult->LastPosition;
                double delta = Math.Abs(queryPositionDelta - positionDelta);
                // Bucket the difference: ~0 -> 0.031, up to 1.1 -> 0.5, up to 2.1 -> 1,
                // larger differences are used as they are.
                if (delta < 0.031)
                {
                    delta = 0.031;
                }
                else if (delta <= 1.1)
                {
                    delta = 0.5;
                }
                else if (delta <= 2.1)
                {
                    delta = 1;
                }
                delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount / (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);
                // (A disabled "some words missed" penalty based on the summed WordRank
                // of the skipped words used to sit here.)
                drp.pDocumentResult->Score = (long)(drp.pDocumentResult->Score * delta);
                // Remember this word as the "last" one for the next proximity comparison.
                // NOTE(review): LastWordIndexQueryCount is read above but not refreshed here,
                // unlike the other Last* fields -- confirm that is intended.
                drp.pDocumentResult->LastIndex = (UInt16)i;
                drp.pDocumentResult->LastPosition = docList.FirstPosition;
                drp.pDocumentResult->LastCount = (UInt16)docList.Count;
                drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
            }
            else
            {
                // First time this document is seen. (A disabled "some words missed"
                // penalty for the earlier words used to sit here.)
                // Documents listed in _NotInDict are excluded.
                bool notInDict = false;
                if (_NotInDict != null)
                {
                    if (_NotInDict.ContainsKey(docList.DocumentId))
                    {
                        notInDict = true;
                    }
                }
                if (!notInDict)
                {
                    if (upDict == null)
                    {
                        DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                        docIdRank.Add(docList.DocumentId, docResult);
                    }
                    else
                    {
                        if (!upDict.Not)
                        {
                            // Upstream is a positive set: keep only ids it contains.
                            if (upDict.ContainsKey(docList.DocumentId))
                            {
                                DocumentResult docResult = new DocumentResult(docList.DocumentId, score, 
                                wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                                docIdRank.Add(docList.DocumentId, docResult);
                            }
                        }
                        else
                        {
                            // Upstream is a negated set: keep only ids it does not contain.
                            if (!upDict.ContainsKey(docList.DocumentId))
                            {
                                DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                                docIdRank.Add(docList.DocumentId, docResult);
                            }
                        }
                    }
                }
            }
            docList = wifq.WordIndex.GetNext();
            j++;
            // Safety stop: never visit more entries than the word index reports.
            if (j > wifq.WordIndex.Count)
            {
                break;
            }
        }
    }
    // Add the upstream scores onto the matching entries (positive upstream sets only).
    if (upDict != null)
    {
        if (!upDict.Not)
        {
            foreach (int docid in docIdRank.Keys)
            {
                DocumentResult *upDrp;
                if (upDict.TryGetValue(docid, out upDrp))
                {
                    DocumentResult *drpResult;
                    if (docIdRank.TryGetValue(docid, out drpResult))
                    {
                        drpResult->Score += upDrp->Score;
                    }
                }
            }
        }
    }
    // (A disabled post-pass that reduced the score of documents missing the trailing
    // words, and clamped negative scores, used to sit here.)
    performanceReport.Stop();
    // Total-count estimate, never larger than the number of documents in the index.
    documentSum += docIdRank.Count;
    if (documentSum > _TotalDocuments)
    {
        documentSum = _TotalDocuments;
    }
    // Drop deleted documents; Filter returns how many entries it removed.
    DeleteProvider delProvider = _DBProvider.DelProvider;
    int deleteCount = delProvider.Filter(docIdRank);
    if (_QueryParameter.CanLoadPartOfDocs && upDict == null)
    {
        // Fewer documents collected than the last word reports: the collected set is
        // partial, so RelTotalCount is derived from the word's RelTotalCount instead.
        if (docIdRank.Count < wordIndexes[wordIndexes.Length - 1].RelTotalCount)
        {
            if (wordIndexes.Length > 1)
            {
                if (wordIndexes[wordIndexes.Length - 1].RelTotalCount > _DBProvider.MaxReturnCount)
                {
                    documentSum += wordIndexes[wordIndexes.Length 
                    - 1].RelTotalCount - _DBProvider.MaxReturnCount;
                }
                if (documentSum > _TotalDocuments)
                {
                    documentSum = _TotalDocuments;
                }
                docIdRank.RelTotalCount = documentSum;
            }
            else
            {
                docIdRank.RelTotalCount = wordIndexes[wordIndexes.Length - 1].RelTotalCount;
            }
        }
    }
    // NOTE(review): when neither branch above assigns RelTotalCount, this subtracts from
    // whatever the dictionary's RelTotalCount getter currently returns.
    docIdRank.RelTotalCount -= deleteCount;
}
/// <summary>
/// Queries the backing database table for document ids and returns them as a result
/// dictionary whose RelTotalCount is the row count reported by a separate
/// <c>select count(*)</c> over the same table and filter.
/// </summary>
/// <param name="end">
/// When non-negative, the id query is limited with <c>top (end + 1)</c>;
/// a negative value means no limit.
/// </param>
/// <param name="where">
/// Optional SQL condition, inserted verbatim after <c>where</c>. Null or empty means no filter.
/// </param>
/// <param name="orderby">
/// Optional SQL ordering, inserted verbatim after <c>order by</c>. Null or empty means no ordering.
/// </param>
/// <returns>
/// One entry per returned row. When DocIdReplaceField is set, the row value is mapped through
/// GetDocIdFromDocIdReplaceFieldValue and rows that map to a negative doc id are skipped.
/// </returns>
public Core.SFQL.Parse.DocumentResultWhereDictionary GetDocumentResults(int end, string where, string orderby)
{
    // NOTE(review): where and orderby are spliced into the SQL text as-is, so they must
    // come from a trusted source (e.g. the SFQL parser), never straight from user input.
    bool hasWhere = !string.IsNullOrEmpty(where);

    string idColumn = DocIdReplaceField == null
        ? "docid"
        : string.Format("[{0}]", DocIdReplaceField);

    // The table name is bracket-delimited in BOTH statements below. The count query used
    // to reference it bare, so a name that needs delimiting broke only the count.
    string tableName = string.Format("[{0}]", Table.DBTableName);

    string sql = end >= 0 ? string.Format("select top {0} ", end + 1) : "select ";
    sql += string.Format(" {0} from {1} ", idColumn, tableName);

    string countSql = string.Format("select count(*) cnt from {0}", tableName);

    if (hasWhere)
    {
        sql += "where " + where;
        countSql += " where " + where;
    }

    if (!string.IsNullOrEmpty(orderby))
    {
        sql += " order by " + orderby;
    }

    Core.SFQL.Parse.DocumentResultWhereDictionary result = new Core.SFQL.Parse.DocumentResultWhereDictionary();

    using (SQLDataProvider sqlData = new SQLDataProvider())
    {
        sqlData.Connect(Table.ConnectionString);

        foreach (System.Data.DataRow row in sqlData.QuerySql(sql).Tables[0].Rows)
        {
            int docId;

            if (DocIdReplaceField == null)
            {
                docId = int.Parse(row[0].ToString());
            }
            else
            {
                docId = DBProvider.GetDocIdFromDocIdReplaceFieldValue(long.Parse(row[DocIdReplaceField].ToString()));

                if (docId < 0)
                {
                    // The replace-field value has no known doc id.
                    continue;
                }
            }

            result.Add(docId, new Hubble.Core.Query.DocumentResult(docId));
        }

        // The total is taken from the database, not from the (possibly top-limited) rows above.
        System.Data.DataSet ds = sqlData.QuerySql(countSql);
        result.RelTotalCount = int.Parse(ds.Tables[0].Rows[0][0].ToString());
    }

    return result;
}
/// <summary>
/// Scores documents word by word (no position weighting): a document found for any word is
/// added to <paramref name="docIdRank"/>, and a document found again for a later word has
/// that word's score added to its entry.
/// </summary>
/// <param name="upDict">
/// Optional upstream result set. When non-null it filters newly added documents:
/// with Not unset only ids present in it are kept, with Not set only ids absent from it.
/// With Not unset its scores are also added onto the matching entries.
/// </param>
/// <param name="docIdRank">
/// Receives the scored documents. May be replaced by a larger pre-sized dictionary
/// when it is empty on entry.
/// </param>
/// <param name="wordIndexes">Word indexes of the query; sorted in place by this method.</param>
unsafe private void Calculate(DocumentResultWhereDictionary upDict, ref DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    // Note: this reorders the caller's array.
    Array.Sort(wordIndexes);

    MinResultCount = _DBProvider.Table.GroupByLimit;

    // Capacity estimate: sum of RelDocCount over all words.
    int maxWordDocListCount = 0;
    int documentSum = 0;

    foreach (WordIndexForQuery wifq in wordIndexes)
    {
        maxWordDocListCount += wifq.WordIndex.RelDocCount;
    }

    // Only swap in a pre-sized dictionary when the current one is still empty.
    if (docIdRank.Count == 0)
    {
        if (maxWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
        {
            docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(maxWordDocListCount);
        }
    }

    Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

    // Pruning is enabled only for one word, when partial loading is allowed
    // and the query has no AND expression.
    bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.NoAndExpression && wordIndexes.Length == 1;

    for (int i = 0; i < wordIndexes.Length; i++)
    {
        WordIndexForQuery wifq = wordIndexes[i];
        Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
        int j = 0;
        int oneWordMaxCount = 0;

        // A negative DocumentId ends the enumeration.
        while (docList.DocumentId >= 0)
        {
            Core.SFQL.Parse.DocumentResultPoint drp;
            drp.pDocumentResult = null;

            if (oneWordOptimize)
            {
                // Fix: the outer test used to be duplicated
                // ("if (j > MinResultCount) { if (j > MinResultCount) {...} else {...} }"),
                // which made the else branch unreachable. oneWordMaxCount therefore stayed 0
                // and nothing was ever pruned. The structure now matches the sibling
                // calculation routines.
                if (j > MinResultCount)
                {
                    // Past the leading window: skip documents whose word count is
                    // below the maximum seen inside the window.
                    if (oneWordMaxCount > docList.Count)
                    {
                        docList = wifq.WordIndex.GetNext();
                        j++;
                        continue;
                    }
                }
                else
                {
                    // Inside the leading window: track the largest word count.
                    if (oneWordMaxCount < docList.Count)
                    {
                        oneWordMaxCount = docList.Count;
                    }
                }
            }

            // Per-word score, computed in long arithmetic.
            // NOTE(review): integer division; throws if Sum_d_t or TotalWordsInThisDocument
            // is 0 -- presumably never the case, confirm against the index writer.
            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

            if (score < 0)
            {
                // A negative value is treated as overflow.
                score = long.MaxValue - 4000000;
            }

            // Only words after the first can hit a document that is already collected.
            bool exists = false;

            if (i > 0)
            {
                exists = docIdRank.TryGetValue(docList.DocumentId, out drp);
            }

            if (exists)
            {
                drp.pDocumentResult->Score += score;
            }
            else
            {
                // Documents listed in _NotInDict are excluded.
                bool notInDict = false;

                if (_NotInDict != null)
                {
                    if (_NotInDict.ContainsKey(docList.DocumentId))
                    {
                        notInDict = true;
                    }
                }

                if (!notInDict)
                {
                    if (upDict == null)
                    {
                        docIdRank.Add(docList.DocumentId, score);
                    }
                    else
                    {
                        if (!upDict.Not)
                        {
                            // Upstream is a positive set: keep only ids it contains.
                            if (upDict.ContainsKey(docList.DocumentId))
                            {
                                docIdRank.Add(docList.DocumentId, score);
                            }
                        }
                        else
                        {
                            // Upstream is a negated set: keep only ids it does not contain.
                            if (!upDict.ContainsKey(docList.DocumentId))
                            {
                                docIdRank.Add(docList.DocumentId, score);
                            }
                        }
                    }
                }
            }

            docList = wifq.WordIndex.GetNext();
            j++;
        }
    }

    // Add the upstream scores onto the matching entries (positive upstream sets only).
    if (upDict != null)
    {
        if (!upDict.Not)
        {
            foreach (int docid in docIdRank.Keys)
            {
                DocumentResult *upDrp;

                if (upDict.TryGetValue(docid, out upDrp))
                {
                    DocumentResult *drpResult;

                    if (docIdRank.TryGetValue(docid, out drpResult))
                    {
                        drpResult->Score += upDrp->Score;
                    }
                }
            }
        }
    }

    // Total-count estimate, never larger than the number of documents in the index.
    documentSum += docIdRank.Count;

    if (documentSum > _TotalDocuments)
    {
        documentSum = _TotalDocuments;
    }

    // Drop deleted documents; Filter returns how many entries it removed.
    DeleteProvider delProvider = _DBProvider.DelProvider;
    int deleteCount = delProvider.Filter(docIdRank);

    if (_QueryParameter.CanLoadPartOfDocs && upDict == null)
    {
        // Fewer documents collected than the last word reports: the collected set is
        // partial, so RelTotalCount is derived from the word's RelTotalCount instead.
        if (docIdRank.Count < wordIndexes[wordIndexes.Length - 1].RelTotalCount)
        {
            if (wordIndexes.Length > 1)
            {
                if (wordIndexes[wordIndexes.Length - 1].RelTotalCount > _DBProvider.MaxReturnCount)
                {
                    documentSum += wordIndexes[wordIndexes.Length - 1].RelTotalCount - _DBProvider.MaxReturnCount;
                }

                if (documentSum > _TotalDocuments)
                {
                    documentSum = _TotalDocuments;
                }

                docIdRank.RelTotalCount = documentSum;
            }
            else
            {
                docIdRank.RelTotalCount = wordIndexes[wordIndexes.Length - 1].RelTotalCount;
            }
        }
    }

    docIdRank.RelTotalCount -= deleteCount;

    performanceReport.Stop();
}
/// <summary>
/// Scores every document produced by a MultiWordsDocIdEnumerator with position weighting
/// and stores the best-scoring ones in <paramref name="docIdRank"/>, which is then flagged
/// as Sorted.
/// </summary>
/// <param name="upDict">Must be null; a non-null value raises ParseException.</param>
/// <param name="docIdRank">
/// Receives the scored documents and the RelTotalCount. Also handed to the enumerator
/// as the group-by dictionary when Argument.NeedGroupBy is set.
/// </param>
/// <param name="wordIndexes">Word indexes of the query.</param>
/// <exception cref="ParseException">Thrown when <paramref name="upDict"/> is not null.</exception>
unsafe private void CalculateWithPositionOrderByScoreDesc(Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    DBProvider dbProvider = Argument.DBProvider;
    bool needFilterUntokenizedConditions = this.Argument.NeedFilterUntokenizedConditions;
    ExpressionTree untokenizedTree = this.Argument.UntokenizedTreeOnRoot;
    if (upDict != null)
    {
        throw new ParseException("UpDict is not null!");
    }
    // Number of rows to keep: End rounded up to the next multiple of 100
    // (e.g. End = 0..99 -> 100, End = 100..199 -> 200), or unlimited when End is negative.
    int top;
    if (Argument.End >= 0)
    {
        top = (1 + Argument.End / 100) * 100;
        if (top <= 0)
        {
            // Guards against a non-positive result (int overflow for very large End).
            top = 100;
        }
    }
    else
    {
        top = int.MaxValue;
    }
    // Unlimited -> collect everything in a list and sort at the end;
    // limited -> keep the candidates in a PriorQueue constructed with top.
    PriorQueue <Docid2Long> priorQueue = null;
    List <Docid2Long> docid2longList = null;
    if (top == int.MaxValue)
    {
        docid2longList = new List <Docid2Long>();
    }
    else
    {
        priorQueue = new PriorQueue <Docid2Long>(top, new DocIdLongComparer(false));
    }
    // Threshold a new candidate has to beat once top rows have been queued.
    long lastMinScore = 0;
    int rows = 0;
    Core.SFQL.Parse.DocumentResultWhereDictionary groupByDict = Argument.NeedGroupBy ? docIdRank : null;
    // Original note (2012-3-18): passing top to the enumerator affected the search result,
    // so that optimisation is disabled and -1 is passed instead.
    MultiWordsDocIdEnumerator mwde = new MultiWordsDocIdEnumerator(wordIndexes, dbProvider, groupByDict, -1, needFilterUntokenizedConditions);
    Entity.OriginalDocumentPositionList odpl = new Hubble.Core.Entity.OriginalDocumentPositionList();
    mwde.GetNextOriginal(ref odpl);
    Entity.DocumentPositionList lastDocList = new Hubble.Core.Entity.DocumentPositionList();
    // Exponent of the proximity factor: 2 / (word count - 1), or 1 for a single word.
    double ratio = 1;
    if (wordIndexes.Length > 1)
    {
        ratio = (double)2 / (double)(wordIndexes.Length - 1);
    }
    // Scratch DocumentResult on the stack, reused for every untokenized-condition check.
    Query.DocumentResult documentResult;
    Query.DocumentResult *drp = &documentResult;
    int skipCount = 0; // documents rejected by the untokenized conditions
    // A negative DocumentId ends the enumeration.
    while (odpl.DocumentId >= 0)
    {
        // Evaluate the untokenized conditions; a document that fails them is
        // counted as skipped and the next one is fetched.
        if (needFilterUntokenizedConditions)
        {
            int docId = odpl.DocumentId;
            drp->DocId = docId;
            drp->PayloadData = dbProvider.GetPayloadDataWithShareLock(docId);
            if (!ParseWhere.GetComparisionExpressionValue(dbProvider, drp, untokenizedTree))
            {
                mwde.GetNextOriginal(ref odpl);
                skipCount++;
                continue;
            }
        }
        // Matched: calculate the score over the words the enumerator selected
        // for this document.
        #region Caclate score
        long totalScore = 0;
        lastDocList.Count = 0;
        lastDocList.FirstPosition = 0;
        int lastWifqIndex = 0;
        for (int i = 0; i < mwde.SelectedCount; i++)
        {
            int index = mwde.SelectedIndexes[i];
            WordIndexForQuery wifq = mwde.WordIndexes[index];
            Int16 count = (Int16)mwde.SelectedDocLists[i].Count;
            int firstPosition = mwde.SelectedDocLists[i].FirstPosition;
            int totalWordsInThisDocument = mwde.SelectedDocLists[i].TotalWordsInThisDocument;
            // Per-word score, computed in long arithmetic.
            // NOTE(review): integer division; throws if Sum_d_t or totalWordsInThisDocument
            // is 0 -- presumably never the case, confirm against the index writer.
            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)count * (long)1000000 / ((long)wifq.Sum_d_t * (long)totalWordsInThisDocument);
            if (score < 0)
            {
                // A negative value is treated as overflow.
                score = long.MaxValue - 4000000;
            }
            double delta = 1;
            if (i > 0)
            {
                // Compare the distance between this word and the previously selected one in
                // the query with the distance between their first positions in the document.
                // NOTE(review): the previous word is read from the wordIndexes parameter while
                // the current one comes from mwde.WordIndexes -- confirm both are in the same order.
                double queryPositionDelta = wifq.FirstPosition - wordIndexes[lastWifqIndex].FirstPosition;
                double positionDelta = firstPosition - lastDocList.FirstPosition;
                delta = Math.Abs(queryPositionDelta - positionDelta);
                // Bucket the difference: ~0 -> 0.031, up to 1.1 -> 0.5, up to 2.1 -> 1,
                // larger differences are used as they are.
                if (delta < 0.031)
                {
                    delta = 0.031;
                }
                else if (delta <= 1.1)
                {
                    delta = 0.5;
                }
                else if (delta <= 2.1)
                {
                    delta = 1;
                }
                delta = Math.Pow((1 / delta), ratio) * count * lastDocList.Count / (double)(wifq.QueryCount * wordIndexes[lastWifqIndex].QueryCount);
            }
            lastDocList.Count = count;
            lastDocList.FirstPosition = firstPosition;
            lastWifqIndex = index;
            totalScore += (long)(score * delta);
        } //End of score calculation
        // Optional per-document rank multiplier taken from the payload.
        if (_HasRankField)
        {
            int rank = dbProvider.SharedPayloadProvider.GetPayloadRank(odpl.DocumentId);
            totalScore *= rank;
            if (totalScore < 0)
            {
                totalScore = long.MaxValue - 4000000;
            }
        }
        // Documents in which every query word was selected get a 10x boost.
        if (mwde.SelectedCount == wordIndexes.Length)
        {
            totalScore *= 10;
            if (totalScore < 0)
            {
                totalScore = 
                long.MaxValue - 4000000;
            }
        }
        #endregion
        // Insert into the priority queue / list.
        if (rows >= top)
        {
            // top rows already queued: only candidates above the current threshold are added,
            // and the threshold is refreshed from the queue's Last element.
            if (lastMinScore < totalScore)
            {
                priorQueue.Add(new Docid2Long(odpl.DocumentId, totalScore));
                lastMinScore = priorQueue.Last.Value1;
            }
        }
        else
        {
            if (top == int.MaxValue)
            {
                docid2longList.Add(new Docid2Long(odpl.DocumentId, totalScore));
            }
            else
            {
                priorQueue.Add(new Docid2Long(odpl.DocumentId, totalScore));
                rows++;
                if (rows == top)
                {
                    // Queue just reached top rows: initialise the threshold.
                    lastMinScore = priorQueue.Last.Value1;
                }
            }
        }
        mwde.GetNextOriginal(ref odpl);
    }
    // Total = everything the enumerator counted minus the documents rejected above.
    docIdRank.RelTotalCount = mwde.TotalDocIdCount - skipCount;
    Docid2Long[] docid2longArr;
    if (top == int.MaxValue)
    {
        docid2longList.Sort(new DocIdLongComparer(false));
        docid2longArr = docid2longList.ToArray();
    }
    else
    {
        docid2longArr = priorQueue.ToArray();
    }
    // Copy the ordered candidates into the result dictionary.
    foreach (Docid2Long docid2Long in docid2longArr)
    {
        long score = docid2Long.Value1;
        if (score < 0)
        {
            // A negative value is treated as overflow.
            score = long.MaxValue - 4000000;
        }
        docIdRank.Add(docid2Long.DocId, new DocumentResult(docid2Long.DocId, score));
    }
    docIdRank.Sorted = true;
}
/// <summary>
/// Removes every document recorded in the delete table from the given result
/// dictionary and, when it has group-by records, from its group-by collection.
/// Runs under a lock on this instance.
/// </summary>
/// <param name="docIdResult">docid result dictionary, modified in place</param>
/// <returns>Number of entries removed from <paramref name="docIdResult"/></returns>
public int Filter(Core.SFQL.Parse.DocumentResultWhereDictionary docIdResult)
{
    lock (this)
    {
        if (_DeleteTbl.Count <= 0)
        {
            return 0;
        }

        int removed = 0;
        bool pruneGroupBy = docIdResult.GroupByCollection.Count > 0;

        if (_DeleteTbl.Count < docIdResult.Count)
        {
            // Fewer deleted ids than results: walk the delete table.
            foreach (int deletedId in _DeleteTbl.Keys)
            {
                if (docIdResult.ContainsKey(deletedId))
                {
                    docIdResult.Remove(deletedId);
                    removed++;
                }

                // In this branch the group-by collection is purged for every deleted id,
                // whether or not the id was in the result dictionary.
                if (pruneGroupBy)
                {
                    docIdResult.RemoveFromGroupByCollection(deletedId);
                }
            }
        }
        else
        {
            // Fewer results than deleted ids: walk the results. Collect first,
            // because the dictionary cannot be changed while it is enumerated.
            List<int> doomed = new List<int>();

            foreach (int candidate in docIdResult.Keys)
            {
                if (_DeleteTbl.ContainsKey(candidate))
                {
                    doomed.Add(candidate);
                }
            }

            foreach (int deletedId in doomed)
            {
                if (docIdResult.Remove(deletedId))
                {
                    removed++;
                }

                if (pruneGroupBy)
                {
                    docIdResult.RemoveFromGroupByCollection(deletedId);
                }
            }
        }

        if (pruneGroupBy)
        {
            docIdResult.CompreassGroupByCollection();
        }

        return removed;
    }
}
/// <summary>
/// Position-aware scoring for the case "order by score desc",
/// with only one expression in the branch of the expression tree
/// and more than two words.
/// Word lists are merged in sorted order until the result holds at least
/// <c>top</c> documents; the remaining words are then only looked up for the
/// documents already collected, and the total hit count is estimated.
/// </summary>
/// <param name="upDict">Must be null; a ParseException is thrown otherwise.</param>
/// <param name="docIdRank">Result dictionary that receives the scored documents; its RelTotalCount is set before returning.</param>
/// <param name="wordIndexes">Word indexes of the query; sorted in place by this method.</param>
unsafe private void CalculateWithPositionOrderByScoreDesc11(Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    if (upDict != null)
    {
        throw new ParseException("UpDict is not null!");
    }

    Array.Sort(wordIndexes);

    //Calculate top: End rounded up to the next multiple of 100 above it,
    //or int.MaxValue when End is negative.
    int top;

    if (this._QueryParameter.End >= 0)
    {
        top = (1 + this._QueryParameter.End / 100) * 100;

        //NOTE(review): presumably guards against int overflow of the expression above - confirm
        if (top <= 0)
        {
            top = 100;
        }

        //if (this._QueryParameter.End * 2 > top)
        //{
        //    top *= 2;
        //}
    }
    else
    {
        top = int.MaxValue;
    }

    //Exponent applied to the position-distance factor; shrinks as the number of words grows
    double ratio = 1;

    if (wordIndexes.Length > 1)
    {
        ratio = (double)2 / (double)(wordIndexes.Length - 1);
    }

    //Get max word doc list count
    //NOTE(review): maxWordDocListCount is computed here but not read again in this method
    int maxWordDocListCount = 0;
    int documentSum = 0;

    foreach (WordIndexForQuery wifq in wordIndexes)
    {
        maxWordDocListCount += wifq.WordIndex.Count;
    }

    maxWordDocListCount += maxWordDocListCount / 2;

    if (maxWordDocListCount > 1024 * 1024)
    {
        maxWordDocListCount = 1024 * 1024;
    }

    Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

    //Stays true only if every word list was scanned without reaching the group by limit
    bool groupbyScanAll = false;

    //Match for group by
    if (this._QueryParameter.NeedGroupBy)
    {
        groupbyScanAll = true;
        int groupbyContainsCount = 0;
        int groupbyLimit = _DBProvider.Table.GroupByLimit;
        BitSet bitSet = new BitSet();

        for (int i = 0; i < wordIndexes.Length; i++)
        {
            WordIndexForQuery wifq = wordIndexes[i];
            Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();

            //A negative DocumentId marks the end of the list
            while (docList.DocumentId >= 0)
            {
                if (bitSet.ForceAdd(docList.DocumentId))
                {
                    groupbyContainsCount++;
                }

                if (groupbyContainsCount >= groupbyLimit)
                {
                    groupbyScanAll = false;
                    break;
                }

                docList = wifq.WordIndex.GetNext();
            }

            //Rewind so the merge pass below can read this list again
            wifq.WordIndex.Reset();

            if (!groupbyScanAll)
            {
                break;
            }
        }

        //NOTE(review): groupByCollect is filled from bitSet but never read or stored
        //afterwards in this method - confirm whether it was meant to be attached to docIdRank
        AscIntList groupByCollect = new AscIntList();
        groupByCollect.AddRange(bitSet);
    }

    //Merge
    //indexInTop ends up as the index of the last word whose list was scanned in this loop
    int indexInTop = 0;

    for (int i = 0; i < wordIndexes.Length; i++)
    {
        //Enough candidates collected; remaining words are handled by direct lookup below
        if (docIdRank.Count >= top)
        {
            break;
        }

        indexInTop = i;

        WordIndexForQuery wifq = wordIndexes[i];

        //Entity.DocumentPositionList[] wifqDocBuf = wifq.WordIndex.DocPositionBuf;

        Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
        int j = 0;

        while (docList.DocumentId >= 0)
        {
            Core.SFQL.Parse.DocumentResultPoint drp;
            drp.pDocumentResult = null;

            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

            if (score < 0)
            {
                //Overflow
                score = long.MaxValue - 4000000;
            }
            else
            {
                //Weight by the word's position in the sorted array: earlier words count more
                switch (i)
                {
                    case 0:
                        score *= 20;
                        break;
                    case 1:
                        score *= 4;
                        break;
                    case 2:
                        score *= 1;
                        break;
                    case 3:
                        score /= 2;
                        break;
                    default:
                        score /= i;
                        break;
                }
            }

            if (score < 0)
            {
                //Overflow
                score = long.MaxValue - 4000000;
            }

            //pDocumentResult was set to null just above, so exits always starts as false here
            bool exits = drp.pDocumentResult != null;

            //For the first word (i == 0) no lookup is made; every doc goes to the add branch
            if (!exits && i > 0)
            {
                exits = docIdRank.TryGetValue(docList.DocumentId, out drp);
            }

            if (exits)
            {
                drp.pDocumentResult->Score += score;
                drp.pDocumentResult->HitCount++;

                //Compare the distance between the two words in the query with
                //their distance in the document
                double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
                double positionDelta = docList.FirstPosition - drp.pDocumentResult->LastPosition;

                double delta = Math.Abs(queryPositionDelta - positionDelta);

                //Bucket the difference; values above 2.1 are used as-is
                if (delta < 0.031)
                {
                    delta = 0.031;
                }
                else if (delta <= 1.1)
                {
                    delta = 0.5;
                }
                else if (delta <= 2.1)
                {
                    delta = 1;
                }

                delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount / (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);

                drp.pDocumentResult->Score = (long)(drp.pDocumentResult->Score * delta);

                //Overflow, if match too much, sometime score would less than zero.
                if (drp.pDocumentResult->Score < 0)
                {
                    drp.pDocumentResult->Score = long.MaxValue - 4000000;
                }

                //Remember this word as the "last" one for the next position comparison
                drp.pDocumentResult->LastIndex = (UInt16)i;
                drp.pDocumentResult->LastPosition = docList.FirstPosition;
                drp.pDocumentResult->LastCount = (UInt16)docList.Count;
                drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
            }
            else
            {
                //Skip documents listed in _NotInDict (when one is set)
                bool notInDict = false;

                if (_NotInDict != null)
                {
                    if (_NotInDict.ContainsKey(docList.DocumentId))
                    {
                        notInDict = true;
                    }
                }

                if (!notInDict)
                {
                    //upDict is null in this function
                    DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                    docIdRank.Add(docList.DocumentId, docResult);
                }
            }

            docList = wifq.WordIndex.GetNext();
            j++;

            //Stop once more entries were read than the word index reports
            if (j > wifq.WordIndex.Count)
            {
                break;
            }
        }
    }

    long maxScoreValue = 0; //Max score value of the docid that hit count less than wordIndexes.Length
    int wordIndexesLen = wordIndexes.Length;

    //Get the max score value of the docs that hit count less than wordIndexes.Length
    foreach (DocumentResultPoint docResult in docIdRank.Values)
    {
        if (docResult.pDocumentResult->HitCount < wordIndexesLen)
        {
            if (docResult.pDocumentResult->Score > maxScoreValue)
            {
                maxScoreValue = docResult.pDocumentResult->Score;
            }
        }
    }

    //Fraction of the collected docs that also contain the last word; used for the count estimate
    double hitRate = 0;

    //Some words were not scanned by the merge loop: look them up per collected document
    if (indexInTop < wordIndexes.Length - 1)
    {
        int[] docidlist = new int[docIdRank.Count];
        int i = 0;

        foreach (int docid in docIdRank.Keys)
        {
            docidlist[i] = docid;
            i++;
        }

        //NOTE(review): presumably sorted so WordIndex.Get is called with ascending doc ids - confirm
        Array.Sort(docidlist);

        int lastWordHitCount = 0;

        foreach (int firstDocId in docidlist)
        {
            int curWord = indexInTop + 1;
            Core.SFQL.Parse.DocumentResultPoint drp;

            if (docIdRank.TryGetValue(firstDocId, out drp))
            {
                while (curWord < wordIndexesLen)
                {
                    Entity.DocumentPositionList docList = wordIndexes[curWord].WordIndex.Get(firstDocId);
                    int curDocId = docList.DocumentId;

                    //A non-negative id means this word occurs in the document
                    if (curDocId >= 0)
                    {
                        drp.pDocumentResult->HitCount++;

                        if (curWord == wordIndexesLen - 1)
                        {
                            lastWordHitCount++;
                        }

                        WordIndexForQuery wifq = wordIndexes[curWord];

                        //Same scoring as in the merge loop above, keyed by curWord instead of i
                        long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                        if (score < 0)
                        {
                            //Overflow
                            score = long.MaxValue - 4000000;
                        }
                        else
                        {
                            switch (curWord)
                            {
                                case 0:
                                    score *= 20;
                                    break;
                                case 1:
                                    score *= 4;
                                    break;
                                case 2:
                                    score *= 1;
                                    break;
                                case 3:
                                    score /= 2;
                                    break;
                                default:
                                    score /= curWord;
                                    break;
                            }
                        }

                        if (score < 0)
                        {
                            //Overflow
                            score = long.MaxValue - 4000000;
                        }

                        drp.pDocumentResult->Score += score;

                        double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
                        double positionDelta = docList.FirstPosition - drp.pDocumentResult->LastPosition;

                        double delta = Math.Abs(queryPositionDelta - positionDelta);

                        if (delta < 0.031)
                        {
                            delta = 0.031;
                        }
                        else if (delta <= 1.1)
                        {
                            delta = 0.5;
                        }
                        else if (delta <= 2.1)
                        {
                            delta = 1;
                        }

                        delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount / (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);

                        drp.pDocumentResult->Score = (long)(drp.pDocumentResult->Score * delta);

                        //Overflow, if match too much, sometime score would less than zero.
                        if (drp.pDocumentResult->Score < 0)
                        {
                            drp.pDocumentResult->Score = long.MaxValue - 4000000;
                        }

                        drp.pDocumentResult->LastIndex = (UInt16)curWord;
                        drp.pDocumentResult->LastPosition = docList.FirstPosition;
                        drp.pDocumentResult->LastCount = (UInt16)docList.Count;
                        drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
                    }

                    curWord++;
                } //While

                //Scores changed above, so refresh the maximum over partially matching docs
                if (drp.pDocumentResult->HitCount < wordIndexesLen)
                {
                    if (drp.pDocumentResult->Score > maxScoreValue)
                    {
                        maxScoreValue = drp.pDocumentResult->Score;
                    }
                }
            }
        }

        if (docidlist.Length > 0)
        {
            hitRate = (double)lastWordHitCount / (double)docidlist.Length;
        }
    }

    //Adjust score of the docs that hit count equal wordIndexes.Length
    //so that they are lifted by the best score among partially matching docs
    foreach (DocumentResultPoint docResult in docIdRank.Values)
    {
        if (docResult.pDocumentResult->HitCount == wordIndexesLen)
        {
            docResult.pDocumentResult->Score += maxScoreValue;

            //NOTE(review): this overflow clamp uses long.MaxValue, while every other clamp
            //in this method uses long.MaxValue - 4000000 - confirm this is intended
            if (docResult.pDocumentResult->Score < 0)
            {
                docResult.pDocumentResult->Score = long.MaxValue;
            }
        }
    }

    performanceReport.Stop();

    //Estimate the total number of matching documents
    documentSum += docIdRank.Count;

    if (indexInTop < wordIndexes.Length - 1)
    {
        //The last word's total is added in full; the unscanned words in between
        //are added scaled by (1 - hitRate)
        documentSum += wordIndexes[wordIndexes.Length - 1].RelTotalCount;

        if (hitRate > 0)
        {
            int predictCount = 0;

            for (int i = indexInTop + 1; i < wordIndexes.Length - 1; i++)
            {
                predictCount += (int)(wordIndexes[i].RelTotalCount * (1 - hitRate));
            }

            documentSum += predictCount;
        }
    }

    //The estimate is capped at the total number of documents
    if (documentSum > _TotalDocuments)
    {
        documentSum = _TotalDocuments;
    }

    docIdRank.RelTotalCount = documentSum;

    //Drop deleted documents and subtract them from the total
    DeleteProvider delProvider = _DBProvider.DelProvider;
    int deleteCount = delProvider.Filter(docIdRank);

    docIdRank.RelTotalCount -= deleteCount;

    //When the group by scan covered everything, the group by collection size replaces the
    //estimate; otherwise it only raises the estimate
    if (groupbyScanAll)
    {
        docIdRank.RelTotalCount = docIdRank.GroupByCollection.Count;
    }
    else if (docIdRank.GroupByCollection.Count > docIdRank.RelTotalCount)
    {
        docIdRank.RelTotalCount = docIdRank.GroupByCollection.Count;
    }
}
/// <summary>
/// Executes the Match query and returns the scored result dictionary.
/// </summary>
/// <remarks>
/// When there are no query words or no word indexes, the method returns early:
/// <c>UpDict</c> for a NOT query that has one, otherwise an empty result.
/// For a NOT query the full match set is calculated without <c>UpDict</c>, flagged
/// with <c>Not = true</c> and then merged with <c>UpDict</c> through
/// <c>AndMergeForNot</c> when <c>UpDict</c> is not null.
/// </remarks>
/// <returns>The result dictionary of this query.</returns>
public Core.SFQL.Parse.DocumentResultWhereDictionary Search()
{
    Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Search of Match");

    Init();

    Core.SFQL.Parse.DocumentResultWhereDictionary result = new Core.SFQL.Parse.DocumentResultWhereDictionary();

    // Nothing to search for.
    if (_QueryWords.Count <= 0 || _WordIndexes.Length <= 0)
    {
        if (_QueryParameter.Not && UpDict != null)
        {
            return UpDict;
        }

        return result;
    }

    bool simpleIndex = _InvertedIndex.IndexMode == Field.IndexMode.Simple;

    if (this._QueryParameter.Not)
    {
        // For not, we have to return all of the records (modified at 18 Jan 2012),
        // so the optimized partial calculation is never used on this path.
        if (simpleIndex)
        {
            Calculate(null, ref result, _WordIndexes);
        }
        else
        {
            CalculateWithPosition(null, ref result, _WordIndexes);
        }

        result.Not = true;

        if (UpDict != null)
        {
            result = result.AndMergeForNot(result, UpDict);
        }
    }
    else if (simpleIndex)
    {
        Calculate(this.UpDict, ref result, _WordIndexes);
    }
    else
    {
        // The optimizer may load only part of the documents, so it is used only when
        // that is allowed and this expression is not narrowed by an upstream result.
        bool canOptimize =
            !this._QueryParameter.NeedDistinct
            && this._QueryParameter.CanLoadPartOfDocs
            && this._QueryParameter.AndExpressionCanBeOptimized(_DBProvider)
            && _WordIndexes.Length > 1
            && this.UpDict == null;

        if (canOptimize)
        {
            IQueryOptimize qOptimize = QueryOptimizeBuilder.Build(
                typeof(MatchOptimize),
                DBProvider,
                _QueryParameter.End,
                _QueryParameter.OrderBy,
                _QueryParameter.OrderBys,
                _QueryParameter.NeedGroupBy,
                _QueryParameter.OrderByCanBeOptimized,
                _QueryParameter.NeedFilterUntokenizedConditions(this._DBProvider),
                _QueryParameter.UntokenizedTreeOnRoot,
                _WordIndexes);

            // Created before the try block: if construction throws, the finally block
            // must not dereference a null report and hide the original exception.
            Query.PerformanceReport performanceReportCalculate = new Hubble.Core.Query.PerformanceReport("Calculate");

            try
            {
                qOptimize.CalculateOptimize(this.UpDict, ref result);
            }
            finally
            {
                performanceReportCalculate.Stop();
            }
        }
        else
        {
            CalculateWithPosition(this.UpDict, ref result, _WordIndexes);
        }
    }

    performanceReport.Stop();

    return result;
}
/// <summary>
/// Optimized calculation for every ordering except plain "order by score desc".
/// Reads the documents of <c>WordIndexes[0]</c>, filters out rows that fail the
/// untokenized conditions or are marked deleted, keeps the best <c>top</c> rows
/// according to the order-by comparer in a priority queue and finally copies
/// them into <paramref name="docIdRank"/>.
/// </summary>
/// <remarks>
/// The shared payload lock is held for the whole calculation and is always
/// released, even when an exception is thrown.
/// </remarks>
/// <param name="upDict">Not read by this method.</param>
/// <param name="docIdRank">
/// Receives the selected documents. <c>RelTotalCount</c> is set to the number of
/// rows that passed the filters and <c>Sorted</c> is set to true when an index
/// reader was available.
/// </param>
unsafe public void CalculateOptimizeNormalOrderBy(Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank)
{
    DBProvider dBProvider = Argument.DBProvider;

    Argument.DBProvider.SharedPayloadProvider.EnterPayloladShareLock();

    // The try block starts right after the lock is taken, so that nothing
    // that can throw runs while the lock is held without a finally.
    try
    {
        bool needFilterUntokenizedConditions = this.Argument.NeedFilterUntokenizedConditions;
        ExpressionTree untokenizedTree = this.Argument.UntokenizedTreeOnRoot;
        bool orderByIncludingScore = Argument.OrderByIncludingScore();

        // Scratch record handed (by pointer) to the untokenized condition evaluator.
        Query.DocumentResult documentResult;
        Query.DocumentResult* drp = &documentResult;

        Field[] orderByFields;
        DocId2LongComparer comparer = DocId2LongComparer.Generate(dBProvider, Argument.OrderBys, out orderByFields);

        bool needGroupBy = Argument.NeedGroupBy;

        WordIndexForQuery wifq = WordIndexes[0];

        _IndexReader = wifq.WordIndex.IndexReader;

        Data.Field rankField = Argument.DBProvider.GetField("Rank");

        if (rankField != null
            && rankField.DataType == Hubble.Core.Data.DataType.Int
            && rankField.IndexType == Hubble.Core.Data.Field.Index.Untokenized)
        {
            _HasRandField = true;
            _RankTab = rankField.TabIndex;
            _DocidPayloads = new OriginalDocumentPositionList[2 * 1024];
            _CurDocidPayloadIndex = _DocidPayloads.Length;
        }

        if (_IndexReader == null)
        {
            return;
        }

        //vars for delete
        bool haveRecordsDeleted = dBProvider.DelProvider.Count > 0;
        int[] delDocs = null;
        int curDelIndex = 0;

        if (haveRecordsDeleted)
        {
            delDocs = dBProvider.DelProvider.DelDocs;

            // Guard against an empty snapshot instead of indexing delDocs[0] blindly.
            if (delDocs == null || delDocs.Length == 0)
            {
                haveRecordsDeleted = false;
            }
        }

        //vars for group by sampling
        int groupByCount = 0;
        int groupByLen = dBProvider.Table.GroupByLimit;
        int groupByStep = 1;
        int groupByIndex = 0;

        // groupByLen > 0 avoids a division by zero; with a limit of 0 nothing is
        // collected anyway because groupByCount < groupByLen never holds.
        if (needGroupBy && groupByLen > 0)
        {
            groupByStep = wifq.RelTotalCount / groupByLen;

            if (groupByStep <= 0)
            {
                groupByStep = 1;
            }
        }

        //calculate top
        //End rounded up to the next multiple of 100 above it; int.MaxValue when End is negative
        int top;

        if (this.Argument.End >= 0)
        {
            top = (1 + this.Argument.End / 100) * 100;

            if (top <= 0)
            {
                top = 100;
            }
        }
        else
        {
            top = int.MaxValue;
        }

        PriorQueue<Docid2Long> priorQueue = new PriorQueue<Docid2Long>(top, comparer);
        int rows = 0;

        Entity.OriginalDocumentPositionList docList = new OriginalDocumentPositionList();

        // Last entry of the queue once it holds 'top' rows; new rows must beat it to get in.
        Docid2Long last = new Docid2Long();
        last.DocId = -1;

        int relCount = 0;

        for (bool notEOF = GetNext(ref docList); notEOF; notEOF = GetNext(ref docList))
        {
            int docId = docList.DocumentId;

            //Process untokenized conditions.
            //If is not matched, get the next one.
            if (needFilterUntokenizedConditions)
            {
                drp->DocId = docId;
                drp->PayloadData = dBProvider.GetPayloadDataWithShareLock(docId);

                if (!ParseWhere.GetComparisionExpressionValue(dBProvider, drp, untokenizedTree))
                {
                    continue;
                }
            }

            //Process deleted records
            //Advance the delete cursor up to the current doc id and skip the doc
            //when the cursor lands exactly on it.
            if (haveRecordsDeleted)
            {
                while (curDelIndex < delDocs.Length && delDocs[curDelIndex] < docId)
                {
                    curDelIndex++;
                }

                if (curDelIndex >= delDocs.Length)
                {
                    // No deleted ids left; stop checking for the rest of the scan.
                    haveRecordsDeleted = false;
                }
                else if (delDocs[curDelIndex] == docId)
                {
                    continue;
                }
            }

            //Sample every groupByStep-th document into the group by collection,
            //up to groupByLen documents.
            if (needGroupBy && groupByCount < groupByLen)
            {
                if (groupByIndex >= groupByStep)
                {
                    groupByIndex = 0;
                }

                if (groupByIndex == 0)
                {
                    docIdRank.AddToGroupByCollection(docId);
                    groupByCount++;
                }

                groupByIndex++;
            }

            relCount++;

            long score = 1;

            if (orderByIncludingScore)
            {
                int wordCount = docList.CountAndWordCount / 8; //one word, score = count

                score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)wordCount * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                //Overflow: clamp before the score takes part in any comparison,
                //whether or not the queue is already full.
                if (score < 0)
                {
                    score = long.MaxValue - 4000000;
                }
            }

            Docid2Long cur = new Docid2Long();
            cur.DocId = docId;
            cur.Rank = docList.TotalWordsInThisDocument;
            Docid2Long.Generate(ref cur, dBProvider, orderByFields, score);

            if (rows >= top)
            {
                //Queue is full: only rows ordered before the current last row get in.
                if (comparer.Compare(last, cur) > 0)
                {
                    priorQueue.Add(cur);
                    last = priorQueue.Last;
                }
            }
            else
            {
                priorQueue.Add(cur);
                rows++;

                if (rows == top)
                {
                    last = priorQueue.Last;
                }
            }
        }

        docIdRank.RelTotalCount = relCount;

        foreach (Docid2Long docid2Long in priorQueue.ToArray())
        {
            long score = comparer.GetScore(docid2Long); //use Rank store TotalWordsInThisDocument

            if (score < 0)
            {
                //Overflow
                score = long.MaxValue - 4000000;
            }

            docIdRank.Add(docid2Long.DocId, new DocumentResult(docid2Long.DocId, score));
        }

        docIdRank.Sorted = true;
    }
    finally
    {
        Argument.DBProvider.SharedPayloadProvider.LeavePayloadShareLock();
    }
}