//public bool GroupByContains(int docId)
//{
//    if (_GroupByDict == null)
//    {
//        return false;
//    }

//    return _GroupByDict.Contains(docId);
//}

/// <summary>
/// Merges two document id lists by delegating to <c>AscIntList.MergeOr</c>.
/// </summary>
/// <param name="src">First list; it is cast to <c>AscIntList</c>.</param>
/// <param name="dest">Second list; it is cast to <c>AscIntList</c>.</param>
/// <returns>Whatever <c>AscIntList.MergeOr</c> returns for the two lists.</returns>
public static IList<int> MergeOr(IList<int> src, IList<int> dest)
{
    // Both arguments are cast explicitly (src first, then dest), so an argument
    // of any other IList<int> implementation fails the cast at this point.
    return AscIntList.MergeOr((AscIntList)src, (AscIntList)dest);
}
/// <summary>
/// Appends a document id to the group-by collection, creating the collection
/// on first use.
/// </summary>
/// <param name="docId">Document id to append.</param>
public void AddToGroupByCollection(int docId)
{
    // The backing list is allocated lazily, only when the first id arrives.
    bool collectionMissing = _GroupByDict == null;

    if (collectionMissing)
    {
        _GroupByDict = new AscIntList();
    }

    _GroupByDict.Add(docId);
}
/// <summary>
/// Scores the documents matched by <paramref name="wordIndexes"/> and stores one entry per
/// document id in <paramref name="docIdRank"/>, then estimates the total result count.
/// Per the original author's note, this variant is for: order by score desc,
/// only one expression in the branch of the expression tree, and more than two words.
/// </summary>
/// <param name="upDict">Must be null; a non-null value makes the method throw.</param>
/// <param name="docIdRank">
/// Result dictionary. Entries are added and updated here, and its RelTotalCount is assigned
/// before the method returns.
/// </param>
/// <param name="wordIndexes">
/// Per-word query indexes. The array is sorted in place, and each element's WordIndex cursor
/// is advanced (GetNext / Get) while scoring.
/// </param>
/// <exception cref="ParseException">Thrown when <paramref name="upDict"/> is not null.</exception>
unsafe private void CalculateWithPositionOrderByScoreDesc11(Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    // This variant only supports being called without an upper dictionary.
    if (upDict != null)
    {
        throw new ParseException("UpDict is not null!");
    }

    // Sorts in place using the elements' own ordering (defined by WordIndexForQuery,
    // not visible in this method). The position of a word after sorting drives its weight below.
    Array.Sort(wordIndexes);

    // Calculate top: the number of collected documents at which the merge loop stops
    // taking on further words.
    int top;

    if (this._QueryParameter.End >= 0)
    {
        // Smallest multiple of 100 that is strictly greater than End.
        top = (1 + this._QueryParameter.End / 100) * 100;

        // With End >= 0 a non-positive value can only come from int overflow
        // (assuming unchecked arithmetic); fall back to 100 in that case.
        if (top <= 0)
        {
            top = 100;
        }

        //if (this._QueryParameter.End * 2 > top)
        //{
        //    top *= 2;
        //}
    }
    else
    {
        // Negative End: no limit.
        top = int.MaxValue;
    }

    // Exponent applied to the position-closeness factor in Math.Pow below:
    // 2 / (wordCount - 1), or 1 when there is at most one word.
    double ratio = 1;

    if (wordIndexes.Length > 1)
    {
        ratio = (double)2 / (double)(wordIndexes.Length - 1);
    }

    // Get max word doc list count: sum of all word list counts, plus 50%, capped at 1024 * 1024.
    // NOTE(review): maxWordDocListCount is not read again anywhere in this method.
    int maxWordDocListCount = 0;
    int documentSum = 0;

    foreach (WordIndexForQuery wifq in wordIndexes)
    {
        maxWordDocListCount += wifq.WordIndex.Count;
    }

    maxWordDocListCount += maxWordDocListCount / 2;

    if (maxWordDocListCount > 1024 * 1024)
    {
        maxWordDocListCount = 1024 * 1024;
    }

    // Stopped after the scoring phases below (performanceReport.Stop()).
    Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

    // Stays true only when group by is requested AND every word list was walked to its end
    // without the number of ids counted by bitSet.ForceAdd reaching groupbyLimit.
    bool groupbyScanAll = false;

    // Match for group by
    if (this._QueryParameter.NeedGroupBy)
    {
        groupbyScanAll = true;
        int groupbyContainsCount = 0;
        int groupbyLimit = _DBProvider.Table.GroupByLimit;

        BitSet bitSet = new BitSet();

        for (int i = 0; i < wordIndexes.Length; i++)
        {
            WordIndexForQuery wifq = wordIndexes[i];

            Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();

            // A negative DocumentId marks the end of the word's document list.
            while (docList.DocumentId >= 0)
            {
                // Only ids for which ForceAdd returns true are counted
                // (presumably ids not already in the set - confirm against BitSet).
                if (bitSet.ForceAdd(docList.DocumentId))
                {
                    groupbyContainsCount++;
                }

                // Limit reached: give up on scanning everything.
                if (groupbyContainsCount >= groupbyLimit)
                {
                    groupbyScanAll = false;
                    break;
                }

                docList = wifq.WordIndex.GetNext();
            }

            // Reset the cursor so the merge loop below can walk this word list again.
            wifq.WordIndex.Reset();

            if (!groupbyScanAll)
            {
                break;
            }
        }

        // NOTE(review): groupByCollect is filled from bitSet but is neither read nor stored
        // anywhere in this method - confirm whether it was meant to be handed to docIdRank.
        AscIntList groupByCollect = new AscIntList();
        groupByCollect.AddRange(bitSet);
    }

    // Merge: walk the word lists in sorted order, creating or updating one result per document.
    // indexInTop ends up as the index of the last word whose list was walked here.
    int indexInTop = 0;

    for (int i = 0; i < wordIndexes.Length; i++)
    {
        // Enough documents collected; the remaining words are handled by the lookup phase below.
        if (docIdRank.Count >= top)
        {
            break;
        }

        indexInTop = i;

        WordIndexForQuery wifq = wordIndexes[i];

        //Entity.DocumentPositionList[] wifqDocBuf = wifq.WordIndex.DocPositionBuf;

        Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
        int j = 0;

        while (docList.DocumentId >= 0)
        {
            Core.SFQL.Parse.DocumentResultPoint drp;
            drp.pDocumentResult = null;

            // Base score for this word in this document, computed in 64-bit integer arithmetic.
            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

            if (score < 0)
            {
                // Overflow: clamp to a large positive value.
                score = long.MaxValue - 4000000;
            }
            else
            {
                // Weight by the word's position in the sorted array: earlier words count more.
                switch (i)
                {
                    case 0:
                        score *= 20;
                        break;
                    case 1:
                        score *= 4;
                        break;
                    case 2:
                        score *= 1;
                        break;
                    case 3:
                        score /= 2;
                        break;
                    default:
                        score /= i;
                        break;
                }
            }

            if (score < 0)
            {
                // Overflow caused by the weighting above.
                score = long.MaxValue - 4000000;
            }

            // Always false at this point: pDocumentResult was set to null just above.
            bool exits = drp.pDocumentResult != null;

            // The dictionary is not consulted for the first word (i == 0).
            if (!exits && i > 0)
            {
                exits = docIdRank.TryGetValue(docList.DocumentId, out drp);
            }

            if (exits)
            {
                // Document already has a result: accumulate this word's score and hit.
                drp.pDocumentResult->Score += score;
                drp.pDocumentResult->HitCount++;

                // Compare the distance between this word and the previously matched word
                // in the query with the same distance in the document.
                double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
                double positionDelta = docList.FirstPosition - drp.pDocumentResult->LastPosition;

                double delta = Math.Abs(queryPositionDelta - positionDelta);

                // Bucket the difference: < 0.031 -> 0.031, <= 1.1 -> 0.5, <= 2.1 -> 1,
                // larger values are kept as they are.
                if (delta < 0.031)
                {
                    delta = 0.031;
                }
                else if (delta <= 1.1)
                {
                    delta = 0.5;
                }
                else if (delta <= 2.1)
                {
                    delta = 1;
                }

                // Turn the bucketed difference into a multiplier: smaller difference gives a
                // larger factor, scaled by the occurrence counts and divided by the query counts.
                // NOTE(review): LastWordIndexQueryCount is read here but, unlike the other Last*
                // fields, is not updated below - confirm that is intended.
                delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount / (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);

                drp.pDocumentResult->Score = (long)(drp.pDocumentResult->Score * delta);

                // Overflow: with too many matches the score can become negative.
                if (drp.pDocumentResult->Score < 0)
                {
                    drp.pDocumentResult->Score = long.MaxValue - 4000000;
                }

                // Remember this word as the last matched word for the next comparison.
                drp.pDocumentResult->LastIndex = (UInt16)i;
                drp.pDocumentResult->LastPosition = docList.FirstPosition;
                drp.pDocumentResult->LastCount = (UInt16)docList.Count;
                drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
            }
            else
            {
                // No existing result: add one unless the document id is listed in _NotInDict.
                bool notInDict = false;

                if (_NotInDict != null)
                {
                    if (_NotInDict.ContainsKey(docList.DocumentId))
                    {
                        notInDict = true;
                    }
                }

                if (!notInDict)
                {
                    // upDict is null in this function, so there is no upper dictionary to check.
                    DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                    docIdRank.Add(docList.DocumentId, docResult);
                }
            }

            docList = wifq.WordIndex.GetNext();
            j++;

            // Guard: never iterate more times than the word list reports entries.
            if (j > wifq.WordIndex.Count)
            {
                break;
            }
        }
    }

    long maxScoreValue = 0; // Max score among the documents whose hit count is less than wordIndexes.Length
    int wordIndexesLen = wordIndexes.Length;

    // Get the max score value of the docs whose hit count is less than wordIndexes.Length
    foreach (DocumentResultPoint docResult in docIdRank.Values)
    {
        if (docResult.pDocumentResult->HitCount < wordIndexesLen)
        {
            if (docResult.pDocumentResult->Score > maxScoreValue)
            {
                maxScoreValue = docResult.pDocumentResult->Score;
            }
        }
    }

    // Fraction of the collected documents that also contain the last word; set in the block below.
    double hitRate = 0;

    // True when the merge loop stopped before walking every word. The words after indexInTop
    // are then only looked up for the documents that were already collected.
    if (indexInTop < wordIndexes.Length - 1)
    {
        // Copy the collected document ids and process them in ascending order.
        int[] docidlist = new int[docIdRank.Count];
        int i = 0;

        foreach (int docid in docIdRank.Keys)
        {
            docidlist[i] = docid;
            i++;
        }

        Array.Sort(docidlist);

        int lastWordHitCount = 0;

        foreach (int firstDocId in docidlist)
        {
            int curWord = indexInTop + 1;

            Core.SFQL.Parse.DocumentResultPoint drp;

            if (docIdRank.TryGetValue(firstDocId, out drp))
            {
                while (curWord < wordIndexesLen)
                {
                    // Look this document up in the current word's index;
                    // a negative DocumentId means the word does not hit the document.
                    Entity.DocumentPositionList docList = wordIndexes[curWord].WordIndex.Get(firstDocId);

                    int curDocId = docList.DocumentId;

                    if (curDocId >= 0)
                    {
                        drp.pDocumentResult->HitCount++;

                        // Count hits on the last word; used for hitRate.
                        if (curWord == wordIndexesLen - 1)
                        {
                            lastWordHitCount++;
                        }

                        WordIndexForQuery wifq = wordIndexes[curWord];

                        // Same base score formula as in the merge loop above.
                        long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                        if (score < 0)
                        {
                            // Overflow: clamp to a large positive value.
                            score = long.MaxValue - 4000000;
                        }
                        else
                        {
                            // Same position-based weighting as in the merge loop above.
                            switch (curWord)
                            {
                                case 0:
                                    score *= 20;
                                    break;
                                case 1:
                                    score *= 4;
                                    break;
                                case 2:
                                    score *= 1;
                                    break;
                                case 3:
                                    score /= 2;
                                    break;
                                default:
                                    score /= curWord;
                                    break;
                            }
                        }

                        if (score < 0)
                        {
                            // Overflow caused by the weighting above.
                            score = long.MaxValue - 4000000;
                        }

                        drp.pDocumentResult->Score += score;

                        // Same position-closeness adjustment as in the merge loop above.
                        double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
                        double positionDelta = docList.FirstPosition - drp.pDocumentResult->LastPosition;

                        double delta = Math.Abs(queryPositionDelta - positionDelta);

                        if (delta < 0.031)
                        {
                            delta = 0.031;
                        }
                        else if (delta <= 1.1)
                        {
                            delta = 0.5;
                        }
                        else if (delta <= 2.1)
                        {
                            delta = 1;
                        }

                        delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount / (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);

                        drp.pDocumentResult->Score = (long)(drp.pDocumentResult->Score * delta);

                        // Overflow: with too many matches the score can become negative.
                        if (drp.pDocumentResult->Score < 0)
                        {
                            drp.pDocumentResult->Score = long.MaxValue - 4000000;
                        }

                        drp.pDocumentResult->LastIndex = (UInt16)curWord;
                        drp.pDocumentResult->LastPosition = docList.FirstPosition;
                        drp.pDocumentResult->LastCount = (UInt16)docList.Count;
                        drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
                    }

                    curWord++;
                } // end while

                // Scores changed above, so refresh the max score of partially matching documents.
                if (drp.pDocumentResult->HitCount < wordIndexesLen)
                {
                    if (drp.pDocumentResult->Score > maxScoreValue)
                    {
                        maxScoreValue = drp.pDocumentResult->Score;
                    }
                }
            }
        }

        if (docidlist.Length > 0)
        {
            hitRate = (double)lastWordHitCount / (double)docidlist.Length;
        }
    }

    // Adjust score of the docs whose hit count equals wordIndexes.Length: adding maxScoreValue
    // lifts them above every partially matching document.
    foreach (DocumentResultPoint docResult in docIdRank.Values)
    {
        if (docResult.pDocumentResult->HitCount == wordIndexesLen)
        {
            docResult.pDocumentResult->Score += maxScoreValue;

            // Overflow: here the clamp is long.MaxValue itself, not long.MaxValue - 4000000.
            if (docResult.pDocumentResult->Score < 0)
            {
                docResult.pDocumentResult->Score = long.MaxValue;
            }
        }
    }

    performanceReport.Stop();

    // Estimate the total number of matching documents, starting from the collected ones.
    documentSum += docIdRank.Count;

    if (indexInTop < wordIndexes.Length - 1)
    {
        // The merge loop stopped early: add the last word's total count ...
        documentSum += wordIndexes[wordIndexes.Length - 1].RelTotalCount;

        if (hitRate > 0)
        {
            // ... and, for the words strictly between indexInTop and the last word,
            // their total counts scaled by (1 - hitRate).
            int predictCount = 0;

            for (int i = indexInTop + 1; i < wordIndexes.Length - 1; i++)
            {
                predictCount += (int)(wordIndexes[i].RelTotalCount * (1 - hitRate));
            }

            documentSum += predictCount;
        }
    }

    // The estimate is capped at _TotalDocuments.
    if (documentSum > _TotalDocuments)
    {
        documentSum = _TotalDocuments;
    }

    docIdRank.RelTotalCount = documentSum;

    // Filter returns a count that is subtracted from the total
    // (presumably the number of deleted documents it removed - confirm against DeleteProvider).
    DeleteProvider delProvider = _DBProvider.DelProvider;
    int deleteCount = delProvider.Filter(docIdRank);

    docIdRank.RelTotalCount -= deleteCount;

    // NOTE(review): GroupByCollection is dereferenced here whether or not group by was
    // requested - confirm that it can never be null.
    if (groupbyScanAll)
    {
        // Every word list was scanned for group by: use the group-by collection size as the total.
        docIdRank.RelTotalCount = docIdRank.GroupByCollection.Count;
    }
    else if (docIdRank.GroupByCollection.Count > docIdRank.RelTotalCount)
    {
        // Otherwise the total is raised to at least the group-by collection size.
        docIdRank.RelTotalCount = docIdRank.GroupByCollection.Count;
    }
}