/// <summary>
/// Advances the reader of the first query word by one entry and mirrors the
/// result into a <see cref="Entity.DocumentPositionList"/>.
/// </summary>
/// <param name="fstWifq">Query word whose <c>WordIndex</c> reader is advanced.</param>
/// <param name="fstODPL">Receives the next entry read by <c>GetNextOriginal</c>.</param>
/// <param name="fstDPL">Receives <paramref name="fstODPL"/> converted by <c>ToDocumentPositionList</c>.</param>
/// <returns>
/// <c>true</c> when the entry just read has a non-negative document id;
/// <c>false</c> otherwise (callers in this file treat a negative id as end of data).
/// </returns>
private unsafe bool GetNext(WordIndexForQuery fstWifq, ref OriginalDocumentPositionList fstODPL, ref Entity.DocumentPositionList fstDPL)
{
    // The former "#if a" block that followed the return statement was removed:
    // it was unreachable, and it contained a value-less "return;" inside this
    // bool method, so it could never have compiled had the symbol been defined.
    fstWifq.WordIndex.GetNextOriginal(ref fstODPL);
    fstODPL.ToDocumentPositionList(ref fstDPL);

    return fstODPL.DocumentId >= 0;
}
/// <summary>
/// Stored procedure body: for every distinct word parsed from the third
/// parameter, outputs one row with the columns Word, TF, IDF, T_D,
/// TotalDoucments and TF_IDF, computed against the inverted index of the
/// given table/field.
/// </summary>
/// <remarks>
/// Parameters[0] is the table name, Parameters[1] the field name and
/// Parameters[2] the query words. Words for which the index returns no reader
/// are reported with all numeric columns set to 0.
/// </remarks>
/// <exception cref="StoredProcException">
/// Thrown when the parameter count is not 3, the table does not exist, or the
/// field has no inverted index.
/// </exception>
public void Run()
{
    if (Parameters.Count != 3)
    {
        throw new StoredProcException("First parameter is table name, second parameter is field name, third parameter is words. SP_GetIDF 'tablename', 'fieldname', 'abc news'");
    }

    string tableName = Parameters[0];
    string fieldName = Parameters[1];

    Data.DBProvider dbProvider = Data.DBProvider.GetDBProvider(tableName);

    if (dbProvider == null)
    {
        throw new StoredProcException(string.Format("Table name {0} does not exist!", tableName));
    }

    Hubble.Core.Index.InvertedIndex invertedIndex = dbProvider.GetInvertedIndex(fieldName);

    if (invertedIndex == null)
    {
        throw new StoredProcException(string.Format("Field name {0} does not exist or is not the tokenized index field!", fieldName));
    }

    string queryStr = Parameters[2];

    List<WordInfo> wordInfos = ParseWhere.GetWordInfoList(queryStr);

    // Word -> accumulated query info. A null value marks a word that was looked
    // up but has no reader in the index.
    Dictionary<string, WordIndexForQuery> wordIndexDict = new Dictionary<string, WordIndexForQuery>();

    foreach (Hubble.Core.Entity.WordInfo wordInfo in wordInfos)
    {
        WordIndexForQuery wifq;

        if (!wordIndexDict.TryGetValue(wordInfo.Word, out wifq))
        {
            // Original author's note on the (false, true) arguments: "Only get step doc index".
            Hubble.Core.Index.WordIndexReader wordIndex = invertedIndex.GetWordIndex(wordInfo.Word, false, true);

            if (wordIndex == null)
            {
                wordIndexDict.Add(wordInfo.Word, null);
                continue;
            }

            wifq = new WordIndexForQuery(wordIndex, invertedIndex.DocumentCount, wordInfo.Rank, 1);
            wifq.QueryCount = 1;
            wifq.FirstPosition = wordInfo.Position;
            wordIndexDict.Add(wordInfo.Word, wifq);
        }
        else if (wifq != null)
        {
            // Repeated word that exists in the index: accumulate rank and count.
            wifq.WordRank += wordInfo.Rank;
            wifq.QueryCount++;
        }
        // else: repeated word that is absent from the index. The stored value is
        // null, so there is nothing to accumulate. Without this guard the second
        // occurrence of such a word raised a NullReferenceException.
    }

    // NOTE: "TotalDoucments" is misspelled, but it is the emitted column name and
    // therefore part of the output schema; it is kept as is.
    AddColumn("Word");
    AddColumn("TF");
    AddColumn("IDF");
    AddColumn("T_D");
    AddColumn("TotalDoucments");
    AddColumn("TF_IDF");

    int totalDocuments = invertedIndex.DocumentCount;

    foreach (KeyValuePair<string, WordIndexForQuery> entry in wordIndexDict)
    {
        NewRow();

        WordIndexForQuery wifq = entry.Value;

        OutputValue("Word", entry.Key);

        if (wifq == null)
        {
            OutputValue("TF", 0);
            OutputValue("T_D", 0);
            OutputValue("TotalDoucments", 0);
            OutputValue("IDF", 0);
            OutputValue("TF_IDF", 0);
        }
        else
        {
            // idf = ln(total documents / RelTotalCount of the word)
            double idf = Math.Log((double)totalDocuments / (double)wifq.RelTotalCount);

            OutputValue("TF", wifq.QueryCount);
            OutputValue("T_D", wifq.RelTotalCount);
            OutputValue("TotalDoucments", totalDocuments);
            OutputValue("IDF", idf);
            OutputValue("TF_IDF", wifq.QueryCount * idf);
        }
    }
}
/// <summary>
/// Walks the documents of the first query word, checks each one against the
/// remaining words, scores the matches and stores the results in
/// <paramref name="docIdRank"/> through a PriorQueue created with size
/// <c>top</c> and the comparer built from <c>Argument.OrderBys</c>.
/// </summary>
/// <param name="upDict">Not referenced anywhere in this method.</param>
/// <param name="docIdRank">
/// Receives the scored documents, the group-by document ids (when
/// <c>Argument.NeedGroupBy</c> is set), <c>RelTotalCount</c> and <c>Sorted = true</c>.
/// </param>
private unsafe void CalculateNormal(Hubble.Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Hubble.Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank)
{
    DBProvider dBProvider = Argument.DBProvider;
    bool needGroupBy = Argument.NeedGroupBy;
    bool needFilterUntokenizedConditions = this.Argument.NeedFilterUntokenizedConditions;
    ExpressionTree untokenizedTree = this.Argument.UntokenizedTreeOnRoot;
    // Scratch DocumentResult, accessed through a pointer when evaluating the
    // untokenized condition tree below.
    Query.DocumentResult documentResult;
    Query.DocumentResult *drp = &documentResult;

    //vars for delete
    // curDelIndex/curDelDocid form a cursor into delDocs that only moves forward.
    // NOTE(review): this presumably relies on delDocs being sorted ascending — confirm.
    bool haveRecordsDeleted = dBProvider.DelProvider.Count > 0;
    int[] delDocs = null;
    int curDelIndex = 0;
    int curDelDocid = 0;

    Field[] orderByFields;
    DocId2LongComparer comparer = DocId2LongComparer.Generate(
        dBProvider, Argument.OrderBys, out orderByFields);

    if (haveRecordsDeleted)
    {
        delDocs = dBProvider.DelProvider.DelDocs;
        curDelDocid = delDocs[curDelIndex];
    }

    // Exponent applied to the position factor: 1 for a single word,
    // otherwise 2 / (number of words - 1).
    double ratio = 1;
    WordIndexForQuery[] wordIndexes = WordIndexes;

    if (wordIndexes.Length > 1)
    {
        ratio = (double)2 / (double)(wordIndexes.Length - 1);
    }

    int wordIndexesLen = wordIndexes.Length;
    WordIndexForQuery fstWifq = wordIndexes[0]; //first word
    OriginalDocumentPositionList fstODPL = new OriginalDocumentPositionList();
    //Entity.DocumentPositionList fstDocList = fstWifq.WordIndex.GetNext();
    // docListArr[i] holds the position list of word i for the document under test.
    Entity.DocumentPositionList[] docListArr = new Hubble.Core.Entity.DocumentPositionList[wordIndexesLen];
    //docListArr[0] = fstDocList;
    //fstWifq.WordIndex.GetNextOriginal(ref fstODPL);
    //fstODPL.ToDocumentPositionList(ref docListArr[0]);
    GetNext(fstWifq, ref fstODPL, ref docListArr[0]);
    OriginalDocumentPositionList odpl = new OriginalDocumentPositionList();
    Entity.DocumentPositionList lastDocList = new Hubble.Core.Entity.DocumentPositionList();

    int top;

    //calculate top
    //Argument.End rounded up to the next multiple of 100; 100 if that is not positive.
    //A negative Argument.End means no limit (int.MaxValue).
    if (this.Argument.End >= 0)
    {
        top = (1 + this.Argument.End / 100) * 100;

        if (top <= 0)
        {
            top = 100;
        }

        //if (this.Argument.End * 2 > top)
        //{
        //    top *= 2;
        //}
    }
    else
    {
        top = int.MaxValue;
    }

    PriorQueue <Docid2Long> priorQueue = new PriorQueue <Docid2Long>(top, comparer);
    // rows counts every accepted document, including those seen after the first `top`.
    int rows = 0;
    // NOTE(review): cur is reused for every document and passed to priorQueue.Add;
    // that is only safe if Docid2Long is a value type — confirm.
    Docid2Long cur = new Docid2Long();
    Docid2Long last = new Docid2Long();
    last.DocId = -1;

    // Outer loop: one iteration per document of the first word; a negative
    // document id ends the loop.
    while (fstODPL.DocumentId >= 0)
    {
        int curWord = 1;
        int firstDocId = fstODPL.DocumentId;

        // Look the current document up in each remaining word.
        while (curWord < wordIndexesLen)
        {
            //docListArr[curWord] = wordIndexes[curWord].WordIndex.Get(firstDocId);
            wordIndexes[curWord].WordIndex.GetNextOriginalWithDocId(ref odpl, firstDocId);
            odpl.ToDocumentPositionList(ref docListArr[curWord]);

            if (docListArr[curWord].DocumentId < 0)
            {
                // Word not found for this document: tolerated only when the word
                // carries the Or flag, otherwise the document is rejected.
                if ((wordIndexes[curWord].Flags & WordInfo.Flag.Or) != 0)
                {
                    curWord++;
                    continue;
                }
                else
                {
                    break;
                }
            }

            curWord++;
        } //While

        // curWord reached the end only if the inner loop did not break,
        // i.e. the document was accepted for every word.
        if (curWord >= wordIndexesLen)
        {
            //Process untokenized conditions.
            //If is not matched, get the next one.
            if (needFilterUntokenizedConditions)
            {
                int docId = firstDocId;
                drp->DocId = docId;
                drp->PayloadData = dBProvider.GetPayloadDataWithShareLock(docId);

                if (!ParseWhere.GetComparisionExpressionValue(dBProvider, drp, untokenizedTree))
                {
                    GetNext(fstWifq, ref fstODPL, ref docListArr[0]);
                    continue;
                }
            }

            //Matched
            //Calculate score
            long totalScore = 0;
            lastDocList.Count = 0;
            lastDocList.FirstPosition = 0;

            for (int i = 0; i < wordIndexesLen; i++)
            {
                WordIndexForQuery wifq = wordIndexes[i];

                if (wifq.WordIndex.Count == 0)
                {
                    //a^5000^0 b^5000^2^1
                    //if has a and hasn't b but b can be or
                    //2010-09-30 eaglet
                    continue;
                }

                Entity.DocumentPositionList docList = docListArr[i];

                // FieldRank * WordRank * Idf_t * Count * 1,000,000 / (Sum_d_t * TotalWordsInThisDocument)
                long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                if (score < 0)
                {
                    //Overflow: a negative result is replaced by a large positive value.
                    score = long.MaxValue - 4000000;
                }

                double delta = 1;

                if (i > 0)
                {
                    //Calculate with position
                    // Compare the distance between this word and the previous one in
                    // the query with the distance between their first positions in the document.
                    double queryPositionDelta = wifq.FirstPosition - wordIndexes[i - 1].FirstPosition;
                    double positionDelta = docList.FirstPosition - lastDocList.FirstPosition;

                    delta = Math.Abs(queryPositionDelta - positionDelta);

                    // Bucket the difference: [0, 0.031) -> 0.031, [0.031, 1.1] -> 0.5,
                    // (1.1, 2.1] -> 1, larger values are kept unchanged.
                    if (delta < 0.031)
                    {
                        delta = 0.031;
                    }
                    else if (delta <= 1.1)
                    {
                        delta = 0.5;
                    }
                    else if (delta <= 2.1)
                    {
                        delta = 1;
                    }

                    // (1/delta)^ratio, scaled by both occurrence counts and divided
                    // by the product of both query counts.
                    delta = Math.Pow((1 / delta), ratio) * docList.Count * lastDocList.Count / (double)(wifq.QueryCount * wordIndexes[i - 1].QueryCount);
                }

                lastDocList.Count = docList.Count;
                lastDocList.FirstPosition = docList.FirstPosition;

                totalScore += (long)(score * delta);
            } //End for cycle

            // Skip documents listed in delDocs.
            // NOTE(review): this check runs after the score has been computed, so the
            // scoring work for deleted documents is discarded.
            if (haveRecordsDeleted)
            {
                if (curDelIndex < delDocs.Length)
                {
                    //If docid deleted, get next
                    if (firstDocId == curDelDocid)
                    {
                        GetNext(fstWifq, ref fstODPL, ref docListArr[0]);
                        continue;
                    }
                    else if (firstDocId > curDelDocid)
                    {
                        // Advance the delete cursor until it reaches or passes the
                        // current document; stop checking once delDocs is exhausted.
                        while (curDelIndex < delDocs.Length && curDelDocid < firstDocId)
                        {
                            curDelIndex++;

                            if (curDelIndex >= delDocs.Length)
                            {
                                haveRecordsDeleted = false;
                                break;
                            }

                            curDelDocid = delDocs[curDelIndex];
                        }

                        if (curDelIndex < delDocs.Length)
                        {
                            if (firstDocId == curDelDocid)
                            {
                                GetNext(fstWifq, ref fstODPL, ref docListArr[0]);
                                continue;
                            }
                        }
                    }
                }
            }

            if (needGroupBy)
            {
                docIdRank.AddToGroupByCollection(firstDocId);
            }

            if (_HasRankField)
            {
                // Multiply by the document's payload rank, clamping on overflow.
                int rank = dBProvider.SharedPayloadProvider.GetPayloadRank(firstDocId);
                totalScore *= rank;

                if (totalScore < 0)
                {
                    totalScore = long.MaxValue - 4000000;
                }
            }

            if (rows >= top)
            {
                // `top` entries were already added: add this one only when
                // comparer.Compare(last, cur) is positive.
                rows++;
                cur.DocId = firstDocId;
                Docid2Long.Generate(ref cur, dBProvider, orderByFields, totalScore);

                if (comparer.Compare(last, cur) > 0)
                {
                    priorQueue.Add(cur);
                    last = priorQueue.Last;
                }
            }
            else
            {
                // Fewer than `top` entries so far: always add.
                cur.DocId = firstDocId;
                Docid2Long.Generate(ref cur, dBProvider, orderByFields, totalScore);
                priorQueue.Add(cur);
                rows++;

                if (rows == top)
                {
                    last = priorQueue.Last;
                }
            }

            //docIdRank.Add(firstDocId, totalScore);
        }//if (curWord >= wordIndexesLen)

        GetNext(fstWifq, ref fstODPL, ref docListArr[0]);
        //fstWifq.WordIndex.GetNextOriginal(ref fstODPL);
        //fstODPL.ToDocumentPositionList(ref docListArr[0]);
    }

    docIdRank.RelTotalCount = rows;

    // Copy the queue content into the result dictionary.
    foreach (Docid2Long docid2Long in priorQueue.ToArray())
    {
        long score = comparer.GetScore(docid2Long);

        if (score < 0)
        {
            //Overflow
            score = long.MaxValue - 4000000;
        }

        docIdRank.Add(docid2Long.DocId, new DocumentResult(docid2Long.DocId, score));
    }

    docIdRank.Sorted = true;
}
/// <summary>
/// Enumerates documents through a <c>MultiWordsDocIdEnumerator</c>, scores each
/// one from the words it selected (including a position-based factor), and
/// stores the results in <paramref name="docIdRank"/> using a
/// <c>DocIdLongComparer(false)</c> ordering.
/// </summary>
/// <param name="upDict">Must be null; a non-null value raises a ParseException.</param>
/// <param name="docIdRank">
/// Receives the scored documents, <c>RelTotalCount</c> and <c>Sorted = true</c>.
/// It is also handed to the enumerator as group-by dictionary when
/// <c>Argument.NeedGroupBy</c> is set.
/// </param>
/// <param name="wordIndexes">Query words passed to the enumerator and used for position/query-count lookups.</param>
unsafe private void CalculateWithPositionOrderByScoreDesc(Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
{
    DBProvider dbProvider = Argument.DBProvider;
    bool needFilterUntokenizedConditions = this.Argument.NeedFilterUntokenizedConditions;
    ExpressionTree untokenizedTree = this.Argument.UntokenizedTreeOnRoot;

    if (upDict != null)
    {
        throw new ParseException("UpDict is not null!");
    }

    //Calculate top
    //Argument.End rounded up to the next multiple of 100; 100 if that is not positive.
    //A negative Argument.End means no limit (int.MaxValue).
    int top;

    if (Argument.End >= 0)
    {
        top = (1 + Argument.End / 100) * 100;

        if (top <= 0)
        {
            top = 100;
        }
    }
    else
    {
        top = int.MaxValue;
    }

    // Exactly one of the two containers is used: a plain list that is sorted at
    // the end when there is no limit, otherwise a PriorQueue of size `top`.
    PriorQueue <Docid2Long> priorQueue = null;
    List <Docid2Long> docid2longList = null;

    if (top == int.MaxValue)
    {
        docid2longList = new List <Docid2Long>();
    }
    else
    {
        priorQueue = new PriorQueue <Docid2Long>(top, new DocIdLongComparer(false));
    }

    // Value1 of priorQueue.Last, refreshed whenever the queue changes after it filled up.
    long lastMinScore = 0;
    // Number of entries added to priorQueue while it was filling (never incremented on the list path).
    int rows = 0;

    Core.SFQL.Parse.DocumentResultWhereDictionary groupByDict = Argument.NeedGroupBy ? docIdRank : null;

    MultiWordsDocIdEnumerator mwde = new MultiWordsDocIdEnumerator(wordIndexes, dbProvider, groupByDict, -1, needFilterUntokenizedConditions);

    //Changed at 2012-3-18, top optimize will effect search result, disable it.
    //MultiWordsDocIdEnumerator mwde = new MultiWordsDocIdEnumerator(wordIndexes, dbProvider, groupByDict, top,
    //    needFilterUntokenizedConditions);

    Entity.OriginalDocumentPositionList odpl = new Hubble.Core.Entity.OriginalDocumentPositionList();
    mwde.GetNextOriginal(ref odpl);

    Entity.DocumentPositionList lastDocList = new Hubble.Core.Entity.DocumentPositionList();

    // Exponent applied to the position factor: 1 for a single word,
    // otherwise 2 / (number of words - 1).
    double ratio = 1;

    if (wordIndexes.Length > 1)
    {
        ratio = (double)2 / (double)(wordIndexes.Length - 1);
    }

    // Scratch DocumentResult, accessed through a pointer when evaluating the
    // untokenized condition tree below.
    Query.DocumentResult documentResult;
    Query.DocumentResult *drp = &documentResult;

    int skipCount = 0; //skip by filter untokenized conditions

    // One iteration per document returned by the enumerator; a negative
    // document id ends the loop.
    while (odpl.DocumentId >= 0)
    {
        //Process untokenized conditions.
        //If is not matched, get the next one.
        if (needFilterUntokenizedConditions)
        {
            int docId = odpl.DocumentId;
            drp->DocId = docId;
            drp->PayloadData = dbProvider.GetPayloadDataWithShareLock(docId);

            if (!ParseWhere.GetComparisionExpressionValue(dbProvider, drp, untokenizedTree))
            {
                mwde.GetNextOriginal(ref odpl);
                skipCount++;
                continue;
            }
        }

        //Matched
        //Calculate score
        #region Caclate score

        long totalScore = 0;
        lastDocList.Count = 0;
        lastDocList.FirstPosition = 0;
        // Index (into the word arrays) of the previously scored word.
        int lastWifqIndex = 0;

        // Only the words the enumerator selected for this document are scored.
        for (int i = 0; i < mwde.SelectedCount; i++)
        {
            int index = mwde.SelectedIndexes[i];

            WordIndexForQuery wifq = mwde.WordIndexes[index];

            // NOTE(review): Count is narrowed to Int16 here — confirm it cannot exceed that range.
            Int16 count = (Int16)mwde.SelectedDocLists[i].Count;
            int firstPosition = mwde.SelectedDocLists[i].FirstPosition;
            int totalWordsInThisDocument = mwde.SelectedDocLists[i].TotalWordsInThisDocument;

            // FieldRank * WordRank * Idf_t * count * 1,000,000 / (Sum_d_t * totalWordsInThisDocument)
            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)count * (long)1000000 / ((long)wifq.Sum_d_t * (long)totalWordsInThisDocument);

            if (score < 0)
            {
                //Overflow: a negative result is replaced by a large positive value.
                score = long.MaxValue - 4000000;
            }

            double delta = 1;

            if (i > 0)
            {
                //Calculate with position
                // Compare the distance between this word and the previously scored one
                // in the query with the distance between their first positions in the document.
                double queryPositionDelta = wifq.FirstPosition - wordIndexes[lastWifqIndex].FirstPosition;
                double positionDelta = firstPosition - lastDocList.FirstPosition;

                delta = Math.Abs(queryPositionDelta - positionDelta);

                // Bucket the difference: [0, 0.031) -> 0.031, [0.031, 1.1] -> 0.5,
                // (1.1, 2.1] -> 1, larger values are kept unchanged.
                if (delta < 0.031)
                {
                    delta = 0.031;
                }
                else if (delta <= 1.1)
                {
                    delta = 0.5;
                }
                else if (delta <= 2.1)
                {
                    delta = 1;
                }

                // (1/delta)^ratio, scaled by both occurrence counts and divided
                // by the product of both query counts.
                delta = Math.Pow((1 / delta), ratio) * count * lastDocList.Count / (double)(wifq.QueryCount * wordIndexes[lastWifqIndex].QueryCount);
            }

            lastDocList.Count = count;
            lastDocList.FirstPosition = firstPosition;
            lastWifqIndex = index;

            totalScore += (long)(score * delta);
        }
        //End of score calculation

        if (_HasRankField)
        {
            // Multiply by the document's payload rank, clamping on overflow.
            int rank = dbProvider.SharedPayloadProvider.GetPayloadRank(odpl.DocumentId);
            totalScore *= rank;

            if (totalScore < 0)
            {
                totalScore = long.MaxValue - 4000000;
            }
        }

        //all of the words matched
        //10 times
        if (mwde.SelectedCount == wordIndexes.Length)
        {
            totalScore *= 10;

            if (totalScore < 0)
            {
                totalScore = long.MaxValue - 4000000;
            }
        }

        #endregion

        //Insert to prior queue
        if (rows >= top)
        {
            // Queue already received `top` entries: add only scores above lastMinScore.
            if (lastMinScore < totalScore)
            {
                priorQueue.Add(new Docid2Long(odpl.DocumentId, totalScore));
                lastMinScore = priorQueue.Last.Value1;
            }
        }
        else
        {
            if (top == int.MaxValue)
            {
                // No limit: collect everything and sort after the loop.
                docid2longList.Add(new Docid2Long(odpl.DocumentId, totalScore));
            }
            else
            {
                priorQueue.Add(new Docid2Long(odpl.DocumentId, totalScore));
                rows++;

                if (rows == top)
                {
                    lastMinScore = priorQueue.Last.Value1;
                }
            }
        }

        mwde.GetNextOriginal(ref odpl);
    }

    // Total reported by the enumerator minus the documents rejected by the
    // untokenized filter in this method.
    docIdRank.RelTotalCount = mwde.TotalDocIdCount - skipCount;

    Docid2Long[] docid2longArr;

    if (top == int.MaxValue)
    {
        docid2longList.Sort(new DocIdLongComparer(false));
        docid2longArr = docid2longList.ToArray();
    }
    else
    {
        docid2longArr = priorQueue.ToArray();
    }

    foreach (Docid2Long docid2Long in docid2longArr)
    {
        long score = docid2Long.Value1;

        if (score < 0)
        {
            //Overflow
            score = long.MaxValue - 4000000;
        }

        docIdRank.Add(docid2Long.DocId, new DocumentResult(docid2Long.DocId, score));
    }

    docIdRank.Sorted = true;
}
/// <summary>
/// Index of this entry in _WordIndexes (per the original author's note).
/// </summary>
internal int Index;

/// <summary>
/// Creates an entry that pairs a query word with its index value.
/// </summary>
/// <param name="index">Value stored in <see cref="Index"/>.</param>
/// <param name="wordIndex">Value stored in the WordIndex member.</param>
internal WordDocIdEntity(int index, WordIndexForQuery wordIndex)
{
    Index = index;
    WordIndex = wordIndex;
}
/// <summary>
/// Query path used for ORDER BY clauses other than a plain "order by score desc".
/// Reads the documents of <c>WordIndexes[0]</c> only, filters them (untokenized
/// conditions, deleted documents), optionally samples them for group-by, and
/// stores the results in <paramref name="docIdRank"/> through a PriorQueue
/// created with size <c>top</c> and the comparer built from <c>Argument.OrderBys</c>.
/// </summary>
/// <remarks>
/// The whole calculation runs under the payload share lock of
/// <c>Argument.DBProvider.SharedPayloadProvider</c>, which is released in a
/// finally block. When <c>wifq.WordIndex.IndexReader</c> is null nothing is
/// added and <c>docIdRank.Sorted</c> is left untouched.
/// </remarks>
/// <param name="upDict">Not referenced anywhere in this method.</param>
/// <param name="docIdRank">
/// Receives the result documents, the sampled group-by document ids,
/// <c>RelTotalCount</c> and <c>Sorted = true</c>.
/// </param>
public unsafe void CalculateOptimizeNormalOrderBy(Core.SFQL.Parse.DocumentResultWhereDictionary upDict, ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank)
{
    DBProvider dBProvider = Argument.DBProvider;

    // The try block starts immediately after the lock is taken so that an
    // exception thrown by any of the setup statements can no longer leak it.
    Argument.DBProvider.SharedPayloadProvider.EnterPayloladShareLock();

    try
    {
        bool needFilterUntokenizedConditions = this.Argument.NeedFilterUntokenizedConditions;
        ExpressionTree untokenizedTree = this.Argument.UntokenizedTreeOnRoot;

        // Scratch DocumentResult, accessed through a pointer when evaluating the
        // untokenized condition tree below.
        Query.DocumentResult documentResult;
        Query.DocumentResult* drp = &documentResult;

        bool orderByIncludingScore = Argument.OrderByIncludingScore();

        Field[] orderByFields;
        DocId2LongComparer comparer = DocId2LongComparer.Generate(
            dBProvider, Argument.OrderBys, out orderByFields);

        bool needGroupBy = Argument.NeedGroupBy;

        WordIndexForQuery wifq = WordIndexes[0];

        _IndexReader = wifq.WordIndex.IndexReader;

        // An untokenized Int field named "Rank" switches on the buffered
        // doc-id payload state used by the single-argument GetNext overload.
        Data.Field rankField = Argument.DBProvider.GetField("Rank");

        if (rankField != null)
        {
            if (rankField.DataType == Hubble.Core.Data.DataType.Int &&
                rankField.IndexType == Hubble.Core.Data.Field.Index.Untokenized)
            {
                _HasRandField = true;
                _RankTab = rankField.TabIndex;
                _DocidPayloads = new OriginalDocumentPositionList[2 * 1024];
                // Start "past the end" of the buffer.
                // NOTE(review): presumably this forces a load on the first read — confirm in GetNext.
                _CurDocidPayloadIndex = _DocidPayloads.Length;
            }
        }

        if (_IndexReader == null)
        {
            return;
        }

        //vars for delete
        // curDelIndex/curDelDocid form a cursor into delDocs that only moves forward.
        // NOTE(review): this presumably relies on delDocs being sorted ascending — confirm.
        bool haveRecordsDeleted = dBProvider.DelProvider.Count > 0;
        int[] delDocs = null;
        int curDelIndex = 0;
        int curDelDocid = 0;

        // Group-by sampling: at most groupByLen documents, one every groupByStep.
        int groupByCount = 0;
        int groupByLen = dBProvider.Table.GroupByLimit;
        int groupByStep = 1;
        int groupByIndex = 0;

        if (needGroupBy && groupByLen > 0)
        {
            // groupByLen > 0 guards the division; with a limit of 0 no document
            // is ever sampled below, so the step is irrelevant.
            groupByStep = wifq.RelTotalCount / groupByLen;

            if (groupByStep <= 0)
            {
                groupByStep = 1;
            }
        }

        if (haveRecordsDeleted)
        {
            delDocs = dBProvider.DelProvider.DelDocs;
            curDelDocid = delDocs[curDelIndex];
        }

        //calculate top
        //Argument.End rounded up to the next multiple of 100; 100 if that is not positive.
        //A negative Argument.End means no limit (int.MaxValue).
        int top;

        if (this.Argument.End >= 0)
        {
            top = (1 + this.Argument.End / 100) * 100;

            if (top <= 0)
            {
                top = 100;
            }
        }
        else
        {
            top = int.MaxValue;
        }

        PriorQueue<Docid2Long> priorQueue = new PriorQueue<Docid2Long>(top, comparer);
        // Number of entries added while the queue was still filling up.
        int rows = 0;

        Entity.OriginalDocumentPositionList docList = new OriginalDocumentPositionList();

        bool notEOF = GetNext(ref docList);

        Docid2Long last = new Docid2Long();
        last.DocId = -1;

        // Number of documents that passed all filters.
        int relCount = 0;

        while (notEOF)
        {
            //Process untokenized conditions.
            //If is not matched, get the next one.
            if (needFilterUntokenizedConditions)
            {
                int docId = docList.DocumentId;
                drp->DocId = docId;
                drp->PayloadData = dBProvider.GetPayloadDataWithShareLock(docId);

                if (!ParseWhere.GetComparisionExpressionValue(dBProvider, drp, untokenizedTree))
                {
                    notEOF = GetNext(ref docList);
                    continue;
                }
            }

            //Process deleted records
            if (haveRecordsDeleted)
            {
                if (curDelIndex < delDocs.Length)
                {
                    //If docid deleted, get next
                    if (docList.DocumentId == curDelDocid)
                    {
                        notEOF = GetNext(ref docList);
                        continue;
                    }
                    else if (docList.DocumentId > curDelDocid)
                    {
                        // Advance the delete cursor until it reaches or passes the
                        // current document; stop checking once delDocs is exhausted.
                        while (curDelIndex < delDocs.Length && curDelDocid < docList.DocumentId)
                        {
                            curDelIndex++;

                            if (curDelIndex >= delDocs.Length)
                            {
                                haveRecordsDeleted = false;
                                break;
                            }

                            curDelDocid = delDocs[curDelIndex];
                        }

                        if (curDelIndex < delDocs.Length)
                        {
                            if (docList.DocumentId == curDelDocid)
                            {
                                notEOF = GetNext(ref docList);
                                continue;
                            }
                        }
                    }
                }
            }

            if (needGroupBy)
            {
                if (groupByCount < groupByLen)
                {
                    if (groupByIndex >= groupByStep)
                    {
                        groupByIndex = 0;
                    }

                    if (groupByIndex == 0)
                    {
                        docIdRank.AddToGroupByCollection(docList.DocumentId);
                        groupByCount++;
                    }

                    groupByIndex++;
                }
            }

            relCount++;

            // The score is computed and clamped once for both insertion paths.
            // Previously the clamp was missing on the "queue already full" path,
            // so an overflowed (negative) score could take part in the comparison.
            long score = 1;

            if (orderByIncludingScore)
            {
                int wordCount = docList.CountAndWordCount / 8; //one word, score = count

                // FieldRank * WordRank * Idf_t * wordCount * 1,000,000 / (Sum_d_t * TotalWordsInThisDocument)
                score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)wordCount * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);
            }

            if (score < 0)
            {
                //Overflow: a negative result is replaced by a large positive value.
                score = long.MaxValue - 4000000;
            }

            Docid2Long cur = new Docid2Long();
            cur.DocId = docList.DocumentId;
            cur.Rank = docList.TotalWordsInThisDocument; //use Rank store TotalWordsInThisDocument
            Docid2Long.Generate(ref cur, dBProvider, orderByFields, score);

            if (rows >= top)
            {
                // `top` entries were already added: add this one only when
                // comparer.Compare(last, cur) is positive.
                if (comparer.Compare(last, cur) > 0)
                {
                    priorQueue.Add(cur);
                    last = priorQueue.Last;
                }
            }
            else
            {
                priorQueue.Add(cur);
                rows++;

                if (rows == top)
                {
                    last = priorQueue.Last;
                }
            }

            notEOF = GetNext(ref docList);
        }

        docIdRank.RelTotalCount = relCount;

        // Copy the queue content into the result dictionary.
        foreach (Docid2Long docid2Long in priorQueue.ToArray())
        {
            long resultScore = comparer.GetScore(docid2Long);

            if (resultScore < 0)
            {
                //Overflow
                resultScore = long.MaxValue - 4000000;
            }

            docIdRank.Add(docid2Long.DocId, new DocumentResult(docid2Long.DocId, resultScore));
        }

        docIdRank.Sorted = true;
    }
    finally
    {
        Argument.DBProvider.SharedPayloadProvider.LeavePayloadShareLock();
    }
}