/// <summary>
/// Scores the documents matched by the query words without taking word positions
/// into account, and adds the surviving document ids and scores to
/// <paramref name="docIdRank"/>.
/// </summary>
/// <param name="upDict">
/// Optional outer result set. When null, no filtering is applied. When its
/// <c>Not</c> flag is false, only documents contained in it are kept and their
/// scores are added on top of the scores calculated here; when <c>Not</c> is true,
/// documents contained in it are excluded.
/// </param>
/// <param name="docIdRank">
/// Receives the matched documents. If it is empty on entry and the smallest word
/// document list is larger than <c>DocumentResultWhereDictionary.DefaultSize</c>,
/// it is replaced by a new dictionary created with that size.
/// </param>
/// <param name="wordIndexes">
/// The query words. The array is sorted in place (<c>Array.Sort</c> followed by
/// <c>AdjustSort</c>) and element 0 drives the document iteration.
/// </param>
unsafe private void Calculate(DocumentResultWhereDictionary upDict,
    ref DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    Array.Sort(wordIndexes);

    AdjustSort(wordIndexes);

    MinResultCount = _DBProvider.Table.GroupByLimit;

    //Get the smallest word doc list count (capped at 1M); it is used to pre-size docIdRank
    int minWordDocListCount = 1 * 1024 * 1024; //1M

    foreach (WordIndexForQuery wifq in wordIndexes)
    {
        minWordDocListCount = Math.Min(minWordDocListCount, wifq.WordIndex.Count);
    }

    if (docIdRank.Count == 0 && minWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
    {
        docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(minWordDocListCount);
    }

    Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

    try
    {
        //Merge
        bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs &&
            this._QueryParameter.NoAndExpression && wordIndexes.Length == 1;

        if (oneWordOptimize)
        {
            //One word
            WordIndexForQuery wifq = wordIndexes[0]; //first word

            //Largest word count seen among the first MinResultCount + 1 documents
            int oneWordMaxCount = 0;

            Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();

            //The iterator section advances to the next document, so "continue" skips safely
            for (int j = 0; docList.DocumentId >= 0; j++, docList = wifq.WordIndex.GetNext())
            {
                if (j > MinResultCount)
                {
                    //Past the minimum result count: skip documents whose word count
                    //is below the maximum recorded so far
                    if (oneWordMaxCount > docList.Count)
                    {
                        continue;
                    }
                }
                else if (oneWordMaxCount < docList.Count)
                {
                    oneWordMaxCount = docList.Count;
                }

                long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t *
                    (long)docList.Count * (long)1000000 /
                    ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                if (score < 0)
                {
                    //Overflow
                    score = long.MaxValue - 4000000;
                }

                int docId = docList.DocumentId;

                bool notInDict = _NotInDict != null && _NotInDict.ContainsKey(docId);

                if (!notInDict)
                {
                    //Keep the doc when there is no outer set, when a normal outer set
                    //contains it, or when a negated outer set does not contain it
                    if (upDict == null || upDict.Not != upDict.ContainsKey(docId))
                    {
                        docIdRank.Add(docId, score);
                    }
                }
            }
        }
        else
        {
            int wordIndexesLen = wordIndexes.Length;

            WordIndexForQuery fstWifq = wordIndexes[0]; //first word

            Entity.DocumentPositionList fstDocList = fstWifq.WordIndex.GetNext();

            Entity.DocumentPositionList[] docListArr =
                new Hubble.Core.Entity.DocumentPositionList[wordIndexesLen];

            docListArr[0] = fstDocList;

            while (fstDocList.DocumentId >= 0)
            {
                int curWord = 1;
                int firstDocId = fstDocList.DocumentId;

                while (curWord < wordIndexesLen)
                {
                    docListArr[curWord] = wordIndexes[curWord].WordIndex.Get(firstDocId);

                    //A missing word only rejects the document when the word is not flagged Or
                    if (docListArr[curWord].DocumentId < 0 &&
                        (wordIndexes[curWord].Flags & WordInfo.Flag.Or) == 0)
                    {
                        break;
                    }

                    curWord++;
                }

                if (curWord >= wordIndexesLen)
                {
                    //Matched
                    long totalScore = 0;

                    for (int i = 0; i < wordIndexesLen; i++)
                    {
                        WordIndexForQuery wifq = wordIndexes[i];
                        Entity.DocumentPositionList docList = docListArr[i];

                        long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t *
                            (long)docList.Count * (long)1000000 /
                            ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                        if (score < 0)
                        {
                            //Overflow
                            score = long.MaxValue - 4000000;
                        }

                        totalScore += score;
                    }

                    bool notInDict = _NotInDict != null && _NotInDict.ContainsKey(firstDocId);

                    if (!notInDict)
                    {
                        if (upDict == null || upDict.Not != upDict.ContainsKey(firstDocId))
                        {
                            docIdRank.Add(firstDocId, totalScore);
                        }
                    }
                }

                fstDocList = fstWifq.WordIndex.GetNext();
                docListArr[0] = fstDocList;
            }
        }

        //Merge score if upDict != null
        if (upDict != null && !upDict.Not)
        {
            foreach (int docid in docIdRank.Keys)
            {
                DocumentResult* upDrp;

                if (upDict.TryGetValue(docid, out upDrp))
                {
                    DocumentResult* drpResult;

                    if (docIdRank.TryGetValue(docid, out drpResult))
                    {
                        drpResult->Score += upDrp->Score;
                    }
                }
            }
        }

        DeleteProvider delProvider = _DBProvider.DelProvider;
        int delCount = delProvider.Filter(docIdRank);

        //oneWordOptimize already implies CanLoadPartOfDocs, so it is not re-checked here
        if (oneWordOptimize && upDict == null)
        {
            docIdRank.RelTotalCount = wordIndexes[0].RelTotalCount - delCount;
        }
        else
        {
            docIdRank.RelTotalCount = docIdRank.Count;
        }
    }
    finally
    {
        //Always stop the report, even when the calculation throws
        performanceReport.Stop();
    }
}
/// <summary>
/// Scores the documents matched by the query words, weighting each word by how
/// closely its first position in the document follows the previous word compared
/// with the distance between the two words in the query.
/// </summary>
/// <remarks>
/// The work is delegated when possible: to <c>CalculateWithPositionMatch</c> when
/// <c>UseMatch</c> returns true, or to a query optimizer (<c>OneWordOptimize</c> for a
/// single word, <c>ContainsOptimize</c> otherwise) when the query parameters allow it.
/// Only when neither applies does this method iterate the documents itself.
/// </remarks>
/// <param name="upDict">
/// Optional outer result set. When null, no filtering is applied. When its
/// <c>Not</c> flag is false, only documents contained in it are kept and their
/// scores are added to the scores calculated here; when <c>Not</c> is true,
/// documents contained in it are excluded.
/// </param>
/// <param name="docIdRank">
/// Receives the matched documents. On the non-optimized path it is replaced by a
/// pre-sized dictionary when it is empty and the smallest word document list is
/// larger than <c>DocumentResultWhereDictionary.DefaultSize</c>.
/// </param>
/// <param name="wordIndexes">
/// The query words. Unless the match path is taken, the array is sorted in place and
/// element 0 drives the document iteration.
/// </param>
unsafe private void CalculateWithPosition(Core.SFQL.Parse.DocumentResultWhereDictionary upDict,
    ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    if (UseMatch(wordIndexes))
    {
        CalculateWithPositionMatch(upDict, ref docIdRank, wordIndexes);
        return;
    }

    Array.Sort(wordIndexes);

    AdjustSort(wordIndexes);

    MinResultCount = _DBProvider.Table.GroupByLimit;

    //Get the smallest word doc list count (capped at 1M); it is used to pre-size docIdRank
    int minWordDocListCount = 1 * 1024 * 1024; //1M

    foreach (WordIndexForQuery wifq in wordIndexes)
    {
        minWordDocListCount = Math.Min(minWordDocListCount, wifq.WordIndex.Count);
    }

    Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

    try
    {
        //Shared precondition of both optimizers
        bool canOptimize = this._QueryParameter.CanLoadPartOfDocs &&
            this._QueryParameter.AndExpressionCanBeOptimized(_DBProvider) &&
            _NotInDict == null &&
            _QueryParameter.End >= 0 &&
            !_QueryParameter.NeedDistinct;

        if (canOptimize)
        {
            //A single word uses the one-word optimizer, anything else the contains optimizer
            System.Type optimizeType = wordIndexes.Length == 1
                ? typeof(OneWordOptimize)
                : typeof(ContainsOptimize);

            IQueryOptimize qOptimize = QueryOptimizeBuilder.Build(optimizeType,
                DBProvider, _QueryParameter.End, _QueryParameter.OrderBy,
                _QueryParameter.OrderBys, _QueryParameter.NeedGroupBy,
                _QueryParameter.OrderByCanBeOptimized,
                _QueryParameter.NeedFilterUntokenizedConditions(this._DBProvider),
                _QueryParameter.UntokenizedTreeOnRoot, wordIndexes);

            qOptimize.CalculateOptimize(upDict, ref docIdRank);
            return;
        }

        if (docIdRank.Count == 0 && minWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
        {
            docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(minWordDocListCount);
        }

        //Exponent applied to the inverse position delta; shrinks as the query gets longer
        double ratio = 1;

        if (wordIndexes.Length > 1)
        {
            ratio = (double)2 / (double)(wordIndexes.Length - 1);
        }

        int wordIndexesLen = wordIndexes.Length;

        WordIndexForQuery fstWifq = wordIndexes[0]; //first word
        OriginalDocumentPositionList fstODPL = new OriginalDocumentPositionList();
        fstWifq.WordIndex.GetNextOriginal(ref fstODPL);

        Entity.DocumentPositionList[] docListArr =
            new Hubble.Core.Entity.DocumentPositionList[wordIndexesLen];
        fstODPL.ToDocumentPositionList(ref docListArr[0]);

        OriginalDocumentPositionList odpl = new OriginalDocumentPositionList();

        while (fstODPL.DocumentId >= 0)
        {
            int curWord = 1;
            int firstDocId = fstODPL.DocumentId;

            while (curWord < wordIndexesLen)
            {
                wordIndexes[curWord].WordIndex.GetNextOriginalWithDocId(ref odpl, firstDocId);
                odpl.ToDocumentPositionList(ref docListArr[curWord]);

                //A missing word only rejects the document when the word is not flagged Or
                if (docListArr[curWord].DocumentId < 0 &&
                    (wordIndexes[curWord].Flags & WordInfo.Flag.Or) == 0)
                {
                    break;
                }

                curWord++;
            }

            if (curWord >= wordIndexesLen)
            {
                //Matched
                //Calculate score
                long totalScore = 0;
                Entity.DocumentPositionList lastDocList = new Hubble.Core.Entity.DocumentPositionList();

                for (int i = 0; i < wordIndexesLen; i++)
                {
                    WordIndexForQuery wifq = wordIndexes[i];

                    if (wifq.WordIndex.Count == 0)
                    {
                        //a^5000^0 b^5000^2^1
                        //if has a and hasn't b but b can be or
                        //2010-09-30 eaglet
                        continue;
                    }

                    Entity.DocumentPositionList docList = docListArr[i];

                    long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t *
                        (long)docList.Count * (long)1000000 /
                        ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                    if (score < 0)
                    {
                        //Overflow
                        score = long.MaxValue - 4000000;
                    }

                    double delta = 1;

                    if (i > 0)
                    {
                        //Calculate with position: compare the distance between this word and
                        //the previous one in the query with the same distance in the document
                        double queryPositionDelta = wifq.FirstPosition - wordIndexes[i - 1].FirstPosition;
                        double positionDelta = docList.FirstPosition - lastDocList.FirstPosition;

                        delta = Math.Abs(queryPositionDelta - positionDelta);

                        if (delta < 0.031)
                        {
                            delta = 0.031;
                        }
                        else if (delta <= 1.1)
                        {
                            delta = 0.5;
                        }
                        else if (delta <= 2.1)
                        {
                            delta = 1;
                        }

                        delta = Math.Pow((1 / delta), ratio) * docList.Count * lastDocList.Count /
                            (double)(wifq.QueryCount * wordIndexes[i - 1].QueryCount);
                    }

                    lastDocList = docList;

                    totalScore += (long)(score * delta);
                }

                bool notInDict = _NotInDict != null && _NotInDict.ContainsKey(firstDocId);

                if (!notInDict)
                {
                    //Keep the doc when there is no outer set, when a normal outer set
                    //contains it, or when a negated outer set does not contain it
                    if (upDict == null || upDict.Not != upDict.ContainsKey(firstDocId))
                    {
                        docIdRank.Add(firstDocId, totalScore);
                    }
                }
            }

            fstWifq.WordIndex.GetNextOriginal(ref fstODPL);
            fstODPL.ToDocumentPositionList(ref docListArr[0]);
        }

        //Merge score if upDict != null
        if (upDict != null && !upDict.Not)
        {
            foreach (int docid in docIdRank.Keys)
            {
                DocumentResult* upDrp;

                if (upDict.TryGetValue(docid, out upDrp))
                {
                    DocumentResult* drpResult;

                    if (docIdRank.TryGetValue(docid, out drpResult))
                    {
                        drpResult->Score += upDrp->Score;
                    }
                }
            }
        }

        //Filter the result set through the delete provider
        _DBProvider.DelProvider.Filter(docIdRank);

        //The optimized paths return above, so on this path the total is always
        //the number of documents left in the result set
        docIdRank.RelTotalCount = docIdRank.Count;
    }
    finally
    {
        //Always stop the report, on the optimized paths and when the calculation throws
        performanceReport.Stop();
    }
}
/// <summary>
/// Walks the documents of the first query word, keeps those that match the other
/// words (words flagged Or may be absent), scores them with the position-aware
/// formula and collects the best ones in a bounded priority queue ordered by the
/// query's order-by fields.
/// </summary>
/// <param name="upDict">Not read by this method.</param>
/// <param name="docIdRank">
/// Receives the queued documents. Its <c>RelTotalCount</c> is set to the number of
/// matched, non-deleted rows, its group-by collection is fed when grouping is
/// requested, and it is marked <c>Sorted</c> on exit.
/// </param>
private unsafe void CalculateNormal(Hubble.Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Hubble.Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank)
{
    DBProvider provider = Argument.DBProvider;
    bool groupByRequested = Argument.NeedGroupBy;
    bool filterUntokenized = this.Argument.NeedFilterUntokenizedConditions;
    ExpressionTree untokenizedRoot = this.Argument.UntokenizedTreeOnRoot;

    // Scratch result handed to the untokenized-condition evaluator by pointer.
    Query.DocumentResult scratch;
    Query.DocumentResult* pScratch = &scratch;

    // Cursor over the deleted doc ids.
    bool deletionsPending = provider.DelProvider.Count > 0;
    int[] deletedIds = null;
    int deletedCursor = 0;
    int deletedId = 0;

    Field[] orderByFields;
    DocId2LongComparer comparer = DocId2LongComparer.Generate(
        provider, Argument.OrderBys, out orderByFields);

    if (deletionsPending)
    {
        deletedIds = provider.DelProvider.DelDocs;
        deletedId = deletedIds[deletedCursor];
    }

    WordIndexForQuery[] words = WordIndexes;
    int wordCount = words.Length;

    // Exponent applied to the inverse position delta.
    double ratio = words.Length > 1
        ? (double)2 / (double)(words.Length - 1)
        : 1;

    WordIndexForQuery leadWord = words[0];
    OriginalDocumentPositionList leadEntry = new OriginalDocumentPositionList();
    Entity.DocumentPositionList[] perWord = new Hubble.Core.Entity.DocumentPositionList[wordCount];

    GetNext(leadWord, ref leadEntry, ref perWord[0]);

    OriginalDocumentPositionList probe = new OriginalDocumentPositionList();
    Entity.DocumentPositionList previous = new Hubble.Core.Entity.DocumentPositionList();

    // Queue capacity: End rounded up to the next multiple of 100, or unbounded.
    int top = int.MaxValue;

    if (this.Argument.End >= 0)
    {
        top = (1 + this.Argument.End / 100) * 100;

        if (top <= 0)
        {
            top = 100;
        }
    }

    PriorQueue<Docid2Long> queue = new PriorQueue<Docid2Long>(top, comparer);
    int rows = 0;

    Docid2Long candidate = new Docid2Long();
    Docid2Long tail = new Docid2Long();
    tail.DocId = -1;

    // The iterator section advances the lead word, so every "continue" below
    // moves on to the next document.
    for (; leadEntry.DocumentId >= 0; GetNext(leadWord, ref leadEntry, ref perWord[0]))
    {
        int docId = leadEntry.DocumentId;
        int wordPos = 1;

        while (wordPos < wordCount)
        {
            words[wordPos].WordIndex.GetNextOriginalWithDocId(ref probe, docId);
            probe.ToDocumentPositionList(ref perWord[wordPos]);

            bool absent = perWord[wordPos].DocumentId < 0;

            if (absent && (words[wordPos].Flags & WordInfo.Flag.Or) == 0)
            {
                break;
            }

            wordPos++;
        }

        if (wordPos < wordCount)
        {
            // A required word is missing from this document.
            continue;
        }

        // Process untokenized conditions; skip the document when they do not hold.
        if (filterUntokenized)
        {
            pScratch->DocId = docId;
            pScratch->PayloadData = provider.GetPayloadDataWithShareLock(docId);

            if (!ParseWhere.GetComparisionExpressionValue(provider, pScratch, untokenizedRoot))
            {
                continue;
            }
        }

        // Matched: calculate score.
        long totalScore = 0;
        previous.Count = 0;
        previous.FirstPosition = 0;

        for (int w = 0; w < wordCount; w++)
        {
            WordIndexForQuery word = words[w];

            if (word.WordIndex.Count == 0)
            {
                // e.g. a^5000^0 b^5000^2^1: has a, has not b, but b can be or
                continue;
            }

            Entity.DocumentPositionList entry = perWord[w];

            long score = (long)word.FieldRank * (long)word.WordRank * (long)word.Idf_t * (long)entry.Count * (long)1000000 / ((long)word.Sum_d_t * (long)entry.TotalWordsInThisDocument);

            if (score < 0)
            {
                // Overflow
                score = long.MaxValue - 4000000;
            }

            double delta = 1;

            if (w > 0)
            {
                // Calculate with position.
                double queryPositionDelta = word.FirstPosition - words[w - 1].FirstPosition;
                double positionDelta = entry.FirstPosition - previous.FirstPosition;

                delta = Math.Abs(queryPositionDelta - positionDelta);

                if (delta < 0.031)
                {
                    delta = 0.031;
                }
                else if (delta <= 1.1)
                {
                    delta = 0.5;
                }
                else if (delta <= 2.1)
                {
                    delta = 1;
                }

                delta = Math.Pow((1 / delta), ratio) * entry.Count * previous.Count / (double)(word.QueryCount * words[w - 1].QueryCount);
            }

            previous.Count = entry.Count;
            previous.FirstPosition = entry.FirstPosition;

            totalScore += (long)(score * delta);
        }

        // Skip documents that appear in the deleted-id list.
        if (deletionsPending && deletedCursor < deletedIds.Length)
        {
            if (docId > deletedId)
            {
                // Move the cursor up to the first deleted id that is not below docId.
                while (deletedCursor < deletedIds.Length && deletedId < docId)
                {
                    deletedCursor++;

                    if (deletedCursor >= deletedIds.Length)
                    {
                        deletionsPending = false;
                        break;
                    }

                    deletedId = deletedIds[deletedCursor];
                }
            }

            if (deletedCursor < deletedIds.Length && docId == deletedId)
            {
                continue;
            }
        }

        if (groupByRequested)
        {
            docIdRank.AddToGroupByCollection(docId);
        }

        if (_HasRankField)
        {
            int rank = provider.SharedPayloadProvider.GetPayloadRank(docId);
            totalScore *= rank;

            if (totalScore < 0)
            {
                totalScore = long.MaxValue - 4000000;
            }
        }

        bool queueFull = rows >= top;

        candidate.DocId = docId;
        Docid2Long.Generate(ref candidate, provider, orderByFields, totalScore);
        rows++;

        if (queueFull)
        {
            // Only enqueue when the comparer ranks the candidate before the current tail.
            if (comparer.Compare(tail, candidate) > 0)
            {
                queue.Add(candidate);
                tail = queue.Last;
            }
        }
        else
        {
            queue.Add(candidate);

            if (rows == top)
            {
                tail = queue.Last;
            }
        }
    }

    docIdRank.RelTotalCount = rows;

    foreach (Docid2Long queued in queue.ToArray())
    {
        long finalScore = comparer.GetScore(queued);

        if (finalScore < 0)
        {
            // Overflow
            finalScore = long.MaxValue - 4000000;
        }

        docIdRank.Add(queued.DocId, new DocumentResult(queued.DocId, finalScore));
    }

    docIdRank.Sorted = true;
}
/// <summary>
/// Reads a document position list from <paramref name="stream"/>.
/// </summary>
/// <remarks>
/// Layout as read here: a VInt document count (a leading 0 means a skip-doc index
/// follows and the real count comes after it), then for the first document its id,
/// a packed count and, unless <paramref name="simple"/>, its first position. Every
/// further document stores its id as a delta from the previous id, followed by the
/// same fields. The packed count holds the word count in <c>count / 8</c> and the
/// total-words index in <c>count % 8</c>.
/// </remarks>
/// <param name="stream">Stream positioned at the start of the list.</param>
/// <param name="documentsCount">
/// On entry, the maximum number of documents to read. On return, the document count
/// stored in the stream.
/// </param>
/// <param name="simple">True when the entries carry no first-position value.</param>
/// <param name="wordCountSum">
/// Sum of the word counts of the documents read after the first one, or 1 when
/// exactly one document is read.
/// NOTE(review): the first document's count is never added; confirm that is intended.
/// </param>
/// <returns>
/// The documents read; an empty array when the effective document count is zero or negative.
/// </returns>
public static DocumentPositionList[] Deserialize(System.IO.Stream stream, ref int documentsCount, bool simple, out long wordCountSum)
{
    //Largest word count an entry can carry
    const int maxWordCount = 32767;

    wordCountSum = 0;

    int docsCount = VInt.sReadFromStream(stream);

    if (docsCount == 0)
    {
        //This index has skip doc index
        DeserializeSkipDocIndex(stream, true);

        docsCount = VInt.sReadFromStream(stream);
    }

    int relDocCount = docsCount;

    int lastDocId = VInt.sReadFromStream(stream);
    int count = VInt.sReadFromStream(stream);

    docsCount = Math.Min(docsCount, documentsCount);

    if (docsCount <= 0)
    {
        //Checked before allocating: a negative size would make the array creation throw
        documentsCount = relDocCount;
        return new DocumentPositionList[0];
    }

    DocumentPositionList[] result = new DocumentPositionList[docsCount];

    //Clamp while the value is still an int; narrowing to Int16 first could never exceed the limit
    int docCount = Math.Min(count / 8, maxWordCount);

    if (!simple)
    {
        int firstPosition = VInt.sReadFromStream(stream);
        result[0] = new DocumentPositionList(lastDocId, docCount, (Int16)(count % 8), firstPosition);
    }
    else
    {
        result[0] = new DocumentPositionList(lastDocId, docCount, (Int16)(count % 8));
    }

    if (docsCount == 1)
    {
        wordCountSum = 1;
    }

    for (int i = 1; i < docsCount; i++)
    {
        //Document ids after the first are stored as deltas
        lastDocId = VInt.sReadFromStream(stream) + lastDocId;
        count = VInt.sReadFromStream(stream);

        docCount = Math.Min(count / 8, maxWordCount);

        if (!simple)
        {
            int firstPosition = VInt.sReadFromStream(stream);
            result[i] = new DocumentPositionList(lastDocId, docCount, (Int16)(count % 8), firstPosition);
        }
        else
        {
            result[i] = new DocumentPositionList(lastDocId, docCount, (Int16)(count % 8));
        }

        wordCountSum += docCount;
    }

    documentsCount = relDocCount;

    return result;
}
/// <summary>
/// Writes a document position list to <paramref name="stream"/>.
/// </summary>
/// <remarks>
/// Output order: the document count, the first document's id, its packed count and,
/// unless <paramref name="simple"/>, its first position. The first element yielded by
/// <paramref name="docPositions"/> is skipped; every later element is written as an
/// id delta from the previously written id, followed by the same fields. The last
/// document id is then appended as raw <c>BitConverter</c> bytes.
/// The packed count is the word count (capped at 32767) times 8 plus the
/// total-words index.
/// </remarks>
/// <param name="first">The first document of the list; written in full.</param>
/// <param name="docsCount">Document count written at the head of the list.</param>
/// <param name="docPositions">The documents; the first one yielded is not written.</param>
/// <param name="stream">Destination stream.</param>
/// <param name="simple">True to omit the first-position values.</param>
static public void Serialize(DocumentPositionList first, int docsCount, IEnumerable<DocumentPositionList> docPositions, System.IO.Stream stream, bool simple)
{
    // Write documents count
    VInt.sWriteToStream(docsCount, stream);

    // Write first document id
    int previousId = first.DocumentId;
    VInt.sWriteToStream(previousId, stream);

    int words = first.Count;
    int packed = (words > 32767 ? 32767 : words) * 8 + first._TotalWordsInThisDocumentIndex; // shift 3 bits
    VInt.sWriteToStream(packed, stream);

    if (!simple)
    {
        VInt.sWriteToStream(first.FirstPosition, stream);
    }

    using (IEnumerator<DocumentPositionList> cursor = docPositions.GetEnumerator())
    {
        // Step over the first element, then write everything that follows it.
        if (cursor.MoveNext())
        {
            while (cursor.MoveNext())
            {
                DocumentPositionList entry = cursor.Current;

                VInt.sWriteToStream(entry.DocumentId - previousId, stream);

                words = entry.Count;
                packed = (words > 32767 ? 32767 : words) * 8 + entry._TotalWordsInThisDocumentIndex; // shift 3 bits
                VInt.sWriteToStream(packed, stream);

                if (!simple)
                {
                    VInt.sWriteToStream(entry.FirstPosition, stream);
                }

                previousId = entry.DocumentId;
            }
        }
    }

    // Trailer: the last document id as raw bytes.
    byte[] trailer = BitConverter.GetBytes(previousId);
    stream.Write(trailer, 0, trailer.Length);
}
/// <summary>
/// Copies this entry into <paramref name="dpl"/> by passing this instance to its
/// <c>Set</c> method.
/// </summary>
/// <param name="dpl">The document position list to overwrite.</param>
public void ToDocumentPositionList(ref DocumentPositionList dpl)
{
    dpl.Set(ref this);
}