コード例 #1
0
        /// <summary>
        /// Scores the documents referenced by <paramref name="wordIndexes"/> and records them in
        /// <paramref name="docIdRank"/>. Every word occurrence contributes a frequency based score; when the
        /// document was already recorded by an earlier word, the accumulated score is additionally scaled by
        /// how closely the distance between the two words in the document matches their distance in the query.
        /// </summary>
        /// <param name="upDict">
        /// Optional filter dictionary. When it is not null and <c>Not</c> is false, only documents it contains
        /// are added and its scores are added to the matching results afterwards. When <c>Not</c> is true, only
        /// documents it does not contain are added.
        /// </param>
        /// <param name="docIdRank">
        /// Result dictionary. It may be replaced by a larger instance when it is empty on entry. Its
        /// <c>RelTotalCount</c> is updated before the method returns.
        /// </param>
        /// <param name="wordIndexes">Words of the query. The array is sorted in place by this method.</param>
        unsafe private void CalculateWithPositionMatch(Core.SFQL.Parse.DocumentResultWhereDictionary upDict,
                                                       ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
        {
            //Value stored in place of a score whose computation overflowed.
            const long OverflowScore = long.MaxValue - 4000000;

            Array.Sort(wordIndexes);

            MinResultCount = _DBProvider.Table.GroupByLimit;

            //Exponent applied to the position match factor. It gets smaller as the query gets longer.
            double ratio = 1;

            if (wordIndexes.Length > 1)
            {
                ratio = (double)2 / (double)(wordIndexes.Length - 1);
            }

            //Capacity hint: 1.5 times the sum of all word doc list counts, capped at 1024 * 1024.
            int maxWordDocListCount = 0;
            int documentSum         = 0;

            foreach (WordIndexForQuery wifq in wordIndexes)
            {
                maxWordDocListCount += wifq.WordIndex.Count;
            }

            maxWordDocListCount += maxWordDocListCount / 2;

            if (maxWordDocListCount > 1024 * 1024)
            {
                maxWordDocListCount = 1024 * 1024;
            }

            //Only an empty dictionary is swapped for a presized one, so existing results are never dropped.
            if (docIdRank.Count == 0 && maxWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
            {
                docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(maxWordDocListCount);
            }

            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

            //Merge
            bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.NoAndExpression && wordIndexes.Length == 1;

            for (int i = 0; i < wordIndexes.Length; i++)
            {
                WordIndexForQuery wifq = wordIndexes[i];

                Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
                int j = 0;
                int oneWordMaxCount = 0;

                //A negative document id marks the end of the word's document list.
                while (docList.DocumentId >= 0)
                {
                    Core.SFQL.Parse.DocumentResultPoint drp;
                    drp.pDocumentResult = null;

                    if (oneWordOptimize)
                    {
                        if (j > MinResultCount)
                        {
                            //Past the leading documents: skip every document whose word count is lower
                            //than the highest count seen among the leading documents.
                            if (oneWordMaxCount > docList.Count)
                            {
                                docList = wifq.WordIndex.GetNext();
                                j++;

                                continue;
                            }
                        }
                        else if (oneWordMaxCount < docList.Count)
                        {
                            oneWordMaxCount = docList.Count;
                        }
                    }

                    if (j > wifq.RelTotalCount)
                    {
                        break;
                    }

                    long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                    if (score < 0)
                    {
                        //Overflow
                        score = OverflowScore;
                    }

                    //The first word never looks up existing entries; later words merge into them.
                    bool exists = i > 0 && docIdRank.TryGetValue(docList.DocumentId, out drp);

                    if (exists)
                    {
                        drp.pDocumentResult->Score += score;

                        //Compare the distance between this word and the previously matched word
                        //in the query with the distance between them in the document.
                        double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
                        double positionDelta      = docList.FirstPosition - drp.pDocumentResult->LastPosition;

                        double delta = Math.Abs(queryPositionDelta - positionDelta);

                        if (delta < 0.031)
                        {
                            delta = 0.031;
                        }
                        else if (delta <= 1.1)
                        {
                            delta = 0.5;
                        }
                        else if (delta <= 2.1)
                        {
                            delta = 1;
                        }

                        delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount /
                                (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);

                        drp.pDocumentResult->Score = (long)(drp.pDocumentResult->Score * delta);

                        //Overflow: a product that no longer fits into a long shows up as a negative score
                        //and would rank the best matching documents last.
                        if (drp.pDocumentResult->Score < 0)
                        {
                            drp.pDocumentResult->Score = OverflowScore;
                        }

                        drp.pDocumentResult->LastIndex    = (UInt16)i;
                        drp.pDocumentResult->LastPosition = docList.FirstPosition;
                        drp.pDocumentResult->LastCount    = (UInt16)docList.Count;
                        drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
                    }
                    else
                    {
                        bool notInDict = _NotInDict != null && _NotInDict.ContainsKey(docList.DocumentId);

                        //Without upDict every document is accepted. With upDict the document must be
                        //contained in it, or must not be contained when upDict.Not is set.
                        if (!notInDict &&
                            (upDict == null || upDict.Not != upDict.ContainsKey(docList.DocumentId)))
                        {
                            DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                            docIdRank.Add(docList.DocumentId, docResult);
                        }
                    }

                    docList = wifq.WordIndex.GetNext();
                    j++;

                    if (j > wifq.WordIndex.Count)
                    {
                        break;
                    }
                }
            }

            //Merge score if upDict != null
            if (upDict != null && !upDict.Not)
            {
                foreach (int docid in docIdRank.Keys)
                {
                    DocumentResult *upDrp;

                    if (upDict.TryGetValue(docid, out upDrp))
                    {
                        DocumentResult *drpResult;

                        if (docIdRank.TryGetValue(docid, out drpResult))
                        {
                            drpResult->Score += upDrp->Score;
                        }
                    }
                }
            }

            performanceReport.Stop();

            //The count is taken before deleted documents are filtered out below.
            documentSum += docIdRank.Count;

            if (documentSum > _TotalDocuments)
            {
                documentSum = _TotalDocuments;
            }

            DeleteProvider delProvider = _DBProvider.DelProvider;
            int            deleteCount = delProvider.Filter(docIdRank);

            //wordIndexes.Length > 0 guards the access to the last element against an empty query.
            if (_QueryParameter.CanLoadPartOfDocs && upDict == null && wordIndexes.Length > 0)
            {
                int lastRelTotalCount = wordIndexes[wordIndexes.Length - 1].RelTotalCount;

                if (docIdRank.Count < lastRelTotalCount)
                {
                    if (wordIndexes.Length > 1)
                    {
                        if (lastRelTotalCount > _DBProvider.MaxReturnCount)
                        {
                            documentSum += lastRelTotalCount - _DBProvider.MaxReturnCount;
                        }

                        if (documentSum > _TotalDocuments)
                        {
                            documentSum = _TotalDocuments;
                        }

                        docIdRank.RelTotalCount = documentSum;
                    }
                    else
                    {
                        docIdRank.RelTotalCount = lastRelTotalCount;
                    }
                }
            }

            docIdRank.RelTotalCount -= deleteCount;
        }
コード例 #2
0
        /// <summary>
        /// Scores documents for a query that is ordered by score descending.
        /// Per the original author's note it is intended for the case where there is only one expression
        /// in the branch of the expression tree and the expression has more than two words.
        /// </summary>
        /// <remarks>
        /// The method works in these steps:
        /// 1. When group by is requested, the word doc lists are scanned into a bit set until the
        ///    group by limit is reached, and every scanned word index is reset afterwards.
        /// 2. Word doc lists are streamed in sorted order until <paramref name="docIdRank"/> holds at
        ///    least "top" documents.
        /// 3. For the words that were not streamed, only the documents collected so far are looked up.
        /// 4. Documents that hit every word get the best score of the partially matching documents
        ///    added, so that they sort above them.
        /// 5. <c>RelTotalCount</c> of <paramref name="docIdRank"/> is estimated.
        /// </remarks>
        /// <param name="upDict">Must be null. This method does not support an upper dictionary.</param>
        /// <param name="docIdRank">Result dictionary that receives the scored documents.</param>
        /// <param name="wordIndexes">Words of the query. The array is sorted in place by this method.</param>
        /// <exception cref="ParseException"><paramref name="upDict"/> is not null.</exception>
        unsafe private void CalculateWithPositionOrderByScoreDesc11(Core.SFQL.Parse.DocumentResultWhereDictionary upDict,
                                                                    ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
        {
            if (upDict != null)
            {
                throw new ParseException("UpDict is not null!");
            }

            Array.Sort(wordIndexes);

            //Calculate top: End rounded up to the next multiple of 100, or unlimited when End is negative.
            int top;

            if (this._QueryParameter.End >= 0)
            {
                top = (1 + this._QueryParameter.End / 100) * 100;

                //The multiplication can overflow for a very large End.
                if (top <= 0)
                {
                    top = 100;
                }
            }
            else
            {
                top = int.MaxValue;
            }

            //Exponent applied to the position match factor. It gets smaller as the query gets longer.
            double ratio = 1;

            if (wordIndexes.Length > 1)
            {
                ratio = (double)2 / (double)(wordIndexes.Length - 1);
            }

            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

            bool groupbyScanAll = false;

            //Match for group by
            if (this._QueryParameter.NeedGroupBy)
            {
                groupbyScanAll = true;

                int    groupbyContainsCount = 0;
                int    groupbyLimit         = _DBProvider.Table.GroupByLimit;
                BitSet bitSet = new BitSet();

                for (int i = 0; i < wordIndexes.Length; i++)
                {
                    WordIndexForQuery           wifq    = wordIndexes[i];
                    Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();

                    while (docList.DocumentId >= 0)
                    {
                        if (bitSet.ForceAdd(docList.DocumentId))
                        {
                            groupbyContainsCount++;
                        }

                        if (groupbyContainsCount >= groupbyLimit)
                        {
                            groupbyScanAll = false;
                            break;
                        }

                        docList = wifq.WordIndex.GetNext();
                    }

                    //Rewind so that the merge below reads this word's doc list from the start again.
                    wifq.WordIndex.Reset();

                    if (!groupbyScanAll)
                    {
                        break;
                    }
                }

                //NOTE(review): groupByCollect is filled but never stored or read afterwards in this
                //method, while docIdRank.GroupByCollection is read at the end. Confirm whether the
                //collected ids were meant to be handed over to docIdRank.
                AscIntList groupByCollect = new AscIntList();
                groupByCollect.AddRange(bitSet);
            }

            //Merge
            //indexInTop is the index of the last word whose doc list was streamed.
            int indexInTop = 0;

            for (int i = 0; i < wordIndexes.Length; i++)
            {
                if (docIdRank.Count >= top)
                {
                    break;
                }

                indexInTop = i;

                WordIndexForQuery wifq = wordIndexes[i];

                Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
                int j = 0;

                //A negative document id marks the end of the word's document list.
                while (docList.DocumentId >= 0)
                {
                    Core.SFQL.Parse.DocumentResultPoint drp;
                    drp.pDocumentResult = null;

                    long score = CalculateOrderedWordScore(wifq, docList, i);

                    //The first word never looks up existing entries; later words merge into them.
                    bool exists = i > 0 && docIdRank.TryGetValue(docList.DocumentId, out drp);

                    if (exists)
                    {
                        drp.pDocumentResult->HitCount++;
                        AccumulatePositionWeightedScore(drp, wifq, docList, score, ratio, i);
                    }
                    else if (_NotInDict == null || !_NotInDict.ContainsKey(docList.DocumentId))
                    {
                        //upDict is null in this function
                        DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                        docIdRank.Add(docList.DocumentId, docResult);
                    }

                    docList = wifq.WordIndex.GetNext();
                    j++;

                    if (j > wifq.WordIndex.Count)
                    {
                        break;
                    }
                }
            }

            long maxScoreValue  = 0; //Max score value of the docid that hit count less than wordIndexes.Length
            int  wordIndexesLen = wordIndexes.Length;

            //Get the max score value of the docs that hit count less than wordIndexes.Length
            foreach (DocumentResultPoint docResult in docIdRank.Values)
            {
                if (docResult.pDocumentResult->HitCount < wordIndexesLen &&
                    docResult.pDocumentResult->Score > maxScoreValue)
                {
                    maxScoreValue = docResult.pDocumentResult->Score;
                }
            }

            //Share of the collected documents that also contain the last word.
            double hitRate = 0;

            if (indexInTop < wordIndexes.Length - 1)
            {
                //Some words were not streamed: look the collected documents up in those words, in
                //ascending document id order.
                int[] docidlist = new int[docIdRank.Count];

                int next = 0;
                foreach (int docid in docIdRank.Keys)
                {
                    docidlist[next] = docid;
                    next++;
                }

                Array.Sort(docidlist);

                int lastWordHitCount = 0;

                foreach (int firstDocId in docidlist)
                {
                    Core.SFQL.Parse.DocumentResultPoint drp;

                    if (!docIdRank.TryGetValue(firstDocId, out drp))
                    {
                        continue;
                    }

                    for (int curWord = indexInTop + 1; curWord < wordIndexesLen; curWord++)
                    {
                        WordIndexForQuery           wifq    = wordIndexes[curWord];
                        Entity.DocumentPositionList docList = wifq.WordIndex.Get(firstDocId);

                        //A negative document id means the word does not occur in this document.
                        if (docList.DocumentId < 0)
                        {
                            continue;
                        }

                        drp.pDocumentResult->HitCount++;

                        if (curWord == wordIndexesLen - 1)
                        {
                            lastWordHitCount++;
                        }

                        long score = CalculateOrderedWordScore(wifq, docList, curWord);

                        AccumulatePositionWeightedScore(drp, wifq, docList, score, ratio, curWord);
                    }

                    if (drp.pDocumentResult->HitCount < wordIndexesLen &&
                        drp.pDocumentResult->Score > maxScoreValue)
                    {
                        maxScoreValue = drp.pDocumentResult->Score;
                    }
                }

                if (docidlist.Length > 0)
                {
                    hitRate = (double)lastWordHitCount / (double)docidlist.Length;
                }
            }

            //Adjust score of the docs that hit count equal wordIndexes.Length
            foreach (DocumentResultPoint docResult in docIdRank.Values)
            {
                if (docResult.pDocumentResult->HitCount == wordIndexesLen)
                {
                    docResult.pDocumentResult->Score += maxScoreValue;

                    //Overflow
                    if (docResult.pDocumentResult->Score < 0)
                    {
                        docResult.pDocumentResult->Score = long.MaxValue;
                    }
                }
            }

            performanceReport.Stop();

            int documentSum = docIdRank.Count;

            if (indexInTop < wordIndexes.Length - 1)
            {
                documentSum += wordIndexes[wordIndexes.Length - 1].RelTotalCount;

                if (hitRate > 0)
                {
                    //Estimate how many documents of the skipped words, excluding the last word,
                    //are not among the collected ones.
                    int predictCount = 0;

                    for (int w = indexInTop + 1; w < wordIndexes.Length - 1; w++)
                    {
                        predictCount += (int)(wordIndexes[w].RelTotalCount * (1 - hitRate));
                    }

                    documentSum += predictCount;
                }
            }

            if (documentSum > _TotalDocuments)
            {
                documentSum = _TotalDocuments;
            }

            docIdRank.RelTotalCount = documentSum;

            DeleteProvider delProvider = _DBProvider.DelProvider;
            int            deleteCount = delProvider.Filter(docIdRank);

            docIdRank.RelTotalCount -= deleteCount;

            //Both original branches assign the same value: RelTotalCount becomes the group by
            //collection count when everything was scanned, or when that count is larger.
            if (groupbyScanAll || docIdRank.GroupByCollection.Count > docIdRank.RelTotalCount)
            {
                docIdRank.RelTotalCount = docIdRank.GroupByCollection.Count;
            }
        }

        /// <summary>
        /// Calculates the score one word contributes to one document and weights it by the position
        /// of the word in the sorted word array: x20 for index 0, x4 for index 1, x1 for index 2,
        /// /2 for index 3 and /index for every later word.
        /// </summary>
        /// <param name="wifq">The query word.</param>
        /// <param name="docList">Occurrence data of the word in the document.</param>
        /// <param name="wordPosition">Index of the word in the sorted word array.</param>
        /// <returns>The weighted score, or <c>long.MaxValue - 4000000</c> when the calculation overflowed.</returns>
        private static long CalculateOrderedWordScore(WordIndexForQuery wifq, Entity.DocumentPositionList docList, int wordPosition)
        {
            const long OverflowScore = long.MaxValue - 4000000;

            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

            if (score < 0)
            {
                //Overflow
                return OverflowScore;
            }

            switch (wordPosition)
            {
            case 0:
                score *= 20;
                break;

            case 1:
                score *= 4;
                break;

            case 2:
                break;

            case 3:
                score /= 2;
                break;

            default:
                score /= wordPosition;
                break;
            }

            //Overflow caused by the weighting
            return score < 0 ? OverflowScore : score;
        }

        /// <summary>
        /// Adds <paramref name="score"/> to a document that was already recorded, scales the sum by how
        /// well the distance to the previously matched word agrees between query and document, and
        /// stores this word as the document's last matched word.
        /// </summary>
        /// <param name="drp">Points to the document result to update.</param>
        /// <param name="wifq">The query word that hit the document.</param>
        /// <param name="docList">Occurrence data of the word in the document.</param>
        /// <param name="score">Score contributed by the word.</param>
        /// <param name="ratio">Exponent applied to the inverse position difference.</param>
        /// <param name="wordPosition">Index of the word in the sorted word array.</param>
        unsafe private static void AccumulatePositionWeightedScore(Core.SFQL.Parse.DocumentResultPoint drp, WordIndexForQuery wifq,
                                                                   Entity.DocumentPositionList docList, long score, double ratio, int wordPosition)
        {
            drp.pDocumentResult->Score += score;

            double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
            double positionDelta      = docList.FirstPosition - drp.pDocumentResult->LastPosition;

            double delta = Math.Abs(queryPositionDelta - positionDelta);

            if (delta < 0.031)
            {
                delta = 0.031;
            }
            else if (delta <= 1.1)
            {
                delta = 0.5;
            }
            else if (delta <= 2.1)
            {
                delta = 1;
            }

            delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount /
                    (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);

            drp.pDocumentResult->Score = (long)(drp.pDocumentResult->Score * delta);

            //Overflow, if match too much, sometime score would less than zero.
            if (drp.pDocumentResult->Score < 0)
            {
                drp.pDocumentResult->Score = long.MaxValue - 4000000;
            }

            drp.pDocumentResult->LastIndex    = (UInt16)wordPosition;
            drp.pDocumentResult->LastPosition = docList.FirstPosition;
            drp.pDocumentResult->LastCount    = (UInt16)docList.Count;
            drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
        }