예제 #1
0
        /// <summary>
        /// Runs this query over <c>_WordIndexes</c> and returns the scored documents.
        /// </summary>
        /// <returns>
        /// When there are no query words or no word indexes: <c>UpDict</c> if this is a
        /// NOT query and <c>UpDict</c> is set, otherwise an empty dictionary.
        /// Otherwise the calculated result; for a NOT query it is flagged with
        /// <c>Not = true</c> and, when <c>UpDict</c> is set, merged with it through
        /// <c>AndMergeForNot</c>.
        /// </returns>
        public Core.SFQL.Parse.DocumentResultWhereDictionary Search()
        {
            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Search of Match1");

            Core.SFQL.Parse.DocumentResultWhereDictionary result = new Core.SFQL.Parse.DocumentResultWhereDictionary();

            // Nothing to look up. Note that the performance report is not stopped on this path.
            if (_QueryWords.Count <= 0 || _WordIndexes.Length <= 0)
            {
                bool passUpDictThrough = _QueryParameter.Not && UpDict != null;

                return passUpDictThrough ? UpDict : result;
            }

            // A NOT query is calculated without the upstream dictionary;
            // the upstream dictionary is combined with the result afterwards instead.
            Core.SFQL.Parse.DocumentResultWhereDictionary upstream = this._QueryParameter.Not ? null : this.UpDict;

            bool simpleIndex = _InvertedIndex.IndexMode == Field.IndexMode.Simple;

            if (simpleIndex)
            {
                Calculate(upstream, ref result, _WordIndexes);
            }
            else
            {
                CalculateWithPosition(upstream, ref result, _WordIndexes);
            }

            if (this._QueryParameter.Not)
            {
                result.Not = true;

                if (UpDict != null)
                {
                    result = result.AndMergeForNot(result, UpDict);
                }
            }

            performanceReport.Stop();

            return result;
        }
예제 #2
0
 /// <summary>
 /// Dispatches the optimized calculation to the implementation that matches
 /// the requested ordering.
 /// </summary>
 /// <param name="upDict">Upstream result dictionary, forwarded unchanged to the chosen implementation.</param>
 /// <param name="docIdRank">Result dictionary, forwarded by reference to the chosen implementation.</param>
 unsafe public void CalculateOptimize(Core.SFQL.Parse.DocumentResultWhereDictionary upDict,
                                      ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank)
 {
     // Any ordering other than "score descending" takes the general path.
     if (!Argument.IsOrderByScoreDesc())
     {
         CalculateOptimizeNormalOrderBy(upDict, ref docIdRank);
         return;
     }

     CalculateOptimizeOrderByScoreDesc(upDict, ref docIdRank);
 }
예제 #3
0
        /// <summary>
        /// Runs a Contains query: searches every part returned by
        /// <c>GetAllPartOfWordIndexes</c> and combines the partial results with <c>OrMerge</c>.
        /// </summary>
        /// <returns>
        /// The result of <c>PartSearch</c> on an empty array when there are no query words
        /// or no parts. Otherwise the combined result; for a NOT query it is flagged with
        /// <c>Not = true</c> and, when <c>UpDict</c> is set, merged with it through
        /// <c>AndMergeForNot</c>.
        /// </returns>
        public Core.SFQL.Parse.DocumentResultWhereDictionary Search()
        {
            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Search of Contains");

            List<WordIndexForQuery[]> parts = GetAllPartOfWordIndexes();

            // Nothing to look up. Note that the performance report is not stopped on this path.
            bool nothingToSearch = _QueryWords.Count <= 0 || parts.Count <= 0;

            if (nothingToSearch)
            {
                return PartSearch(new WordIndexForQuery[0]);
            }

            // The first part is always searched; the remaining parts are merged into it.
            Core.SFQL.Parse.DocumentResultWhereDictionary result = PartSearch(parts[0]);

            for (int partNo = 1; partNo < parts.Count; partNo++)
            {
                WordIndexForQuery[] part = parts[partNo];
                bool everyWordHasHits = true;

                foreach (WordIndexForQuery word in part)
                {
                    if (word.WordIndex.WordCount == 0)
                    {
                        // One word of this part has no occurrences: the part is skipped.
                        // Words visited before this one have already been reset.
                        everyWordHasHits = false;
                        break;
                    }

                    word.WordIndex.Reset();
                }

                if (everyWordHasHits)
                {
                    // NOTE(review): the return value of OrMerge is discarded; this relies on
                    // OrMerge updating its first argument in place - confirm.
                    result.OrMerge(result, PartSearch(part));
                }
            }

            if (this._QueryParameter.Not)
            {
                result.Not = true;

                if (UpDict != null)
                {
                    result = result.AndMergeForNot(result, UpDict);
                }
            }

            performanceReport.Stop();

            return result;
        }
예제 #4
0
        /// <summary>
        /// Intersects two result dictionaries by document id. According to the original
        /// author's note this is meant for the case where <c>Not</c> is false on both inputs.
        /// </summary>
        /// <param name="fst">First dictionary; when null an empty dictionary is returned.</param>
        /// <param name="sec">Second dictionary; when null an empty dictionary is returned.</param>
        /// <returns>
        /// A new dictionary holding the documents present in both inputs. Each score is the
        /// sum of the two input scores, and <c>Not</c> is copied from the input with the
        /// larger <c>Count</c> (from <paramref name="sec"/> on a tie).
        /// </returns>
        /// <remarks>
        /// The summed score and the payload are written through the pointer obtained from the
        /// larger input's <c>TryGetValue</c> before the entry is copied into the result.
        /// </remarks>
        unsafe public Core.SFQL.Parse.DocumentResultWhereDictionary AndMergeDict(Core.SFQL.Parse.DocumentResultWhereDictionary fst, Core.SFQL.Parse.DocumentResultWhereDictionary sec)
        {
            if (fst == null || sec == null)
            {
                return new Core.SFQL.Parse.DocumentResultWhereDictionary();
            }

            // Walk the smaller dictionary and probe the larger one.
            bool firstIsLarger = fst.Count > sec.Count;

            Core.SFQL.Parse.DocumentResultWhereDictionary smaller = firstIsLarger ? sec : fst;
            Core.SFQL.Parse.DocumentResultWhereDictionary larger  = firstIsLarger ? fst : sec;

            Core.SFQL.Parse.DocumentResultWhereDictionary merged = new DocumentResultWhereDictionary();
            merged.Not = larger.Not;

            foreach (Core.SFQL.Parse.DocumentResultPoint point in smaller.Values)
            {
                Query.DocumentResult *hit;

                if (!larger.TryGetValue(point.pDocumentResult->DocId, out hit))
                {
                    // Present in only one input: not part of the intersection.
                    continue;
                }

                hit->Score += point.pDocumentResult->Score;

                // Keep the payload already on the larger side; otherwise take the other one.
                if (hit->PayloadData == null && point.pDocumentResult->PayloadData != null)
                {
                    hit->PayloadData = point.pDocumentResult->PayloadData;
                }

                merged.Add(point.pDocumentResult->DocId, *hit);
            }

            return merged;
        }
예제 #5
0
        /// <summary>
        /// Calculates the result for one group of word indexes.
        /// </summary>
        /// <param name="wordIndexes">Word indexes to search; may be empty.</param>
        /// <returns>
        /// When there are no query words or <paramref name="wordIndexes"/> is empty:
        /// <c>UpDict</c> if this is a NOT query and <c>UpDict</c> is set, otherwise an empty
        /// dictionary. Otherwise the dictionary filled by <c>Calculate</c> or
        /// <c>CalculateWithPosition</c>, depending on the index mode.
        /// </returns>
        private Core.SFQL.Parse.DocumentResultWhereDictionary PartSearch(WordIndexForQuery[] wordIndexes)
        {
            Core.SFQL.Parse.DocumentResultWhereDictionary result = new Core.SFQL.Parse.DocumentResultWhereDictionary();

            if (_QueryWords.Count <= 0 || wordIndexes.Length <= 0)
            {
                bool passUpDictThrough = _QueryParameter.Not && UpDict != null;

                return passUpDictThrough ? UpDict : result;
            }

            // A NOT query is calculated without the upstream dictionary.
            Core.SFQL.Parse.DocumentResultWhereDictionary upstream = this._QueryParameter.Not ? null : this.UpDict;

            bool simpleIndex = _InvertedIndex.IndexMode == Field.IndexMode.Simple;

            if (simpleIndex)
            {
                Calculate(upstream, ref result, wordIndexes);
            }
            else
            {
                CalculateWithPosition(upstream, ref result, wordIndexes);
            }

            return result;
        }
예제 #6
0
        /// <summary>
        /// Scores documents for the given words without using word positions and adds
        /// them to <paramref name="docIdRank"/>. With several words, only documents that
        /// contain every word are added.
        /// </summary>
        /// <param name="upDict">
        /// Optional upstream dictionary used as a filter. When its <c>Not</c> flag is false
        /// only documents it contains are kept (and its scores are added to theirs);
        /// when <c>Not</c> is true only documents it does not contain are kept.
        /// </param>
        /// <param name="docIdRank">
        /// Result dictionary. It may be replaced by a larger pre-sized instance when it is empty on entry.
        /// </param>
        /// <param name="wordIndexes">
        /// Words to search; sorted in place. Must hold at least one element, since
        /// <c>wordIndexes[0]</c> is read unconditionally.
        /// </param>
        unsafe private void Calculate(DocumentResultWhereDictionary upDict,
                                      ref DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
        {
            // Sorts in place using the element type's own comparison (defined outside this block).
            Array.Sort(wordIndexes);

            MinResultCount = _DBProvider.Table.GroupByLimit;

            //Get the smallest word doc list count (capped at 1M); it is used to pre-size the result
            int minWordDocListCount = 1 * 1024 * 1024; //1M

            foreach (WordIndexForQuery wifq in wordIndexes)
            {
                minWordDocListCount = Math.Min(minWordDocListCount, wifq.WordIndex.WordDocList.Count);
            }

            // Only an empty result dictionary is swapped for a pre-sized one.
            if (docIdRank.Count == 0)
            {
                if (minWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
                {
                    docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(minWordDocListCount);
                }
            }

            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

            //Merge
            bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.NoAndExpression && wordIndexes.Length == 1;
            int  oneWordMaxCount = 0;

            if (oneWordOptimize)
            {
                //One word
                WordIndexForQuery wifq = wordIndexes[0]; //first word

                //Entity.DocumentPositionList[] wifqDocBuf = wifq.WordIndex.DocPositionBuf;

                Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
                int j = 0;

                // A negative DocumentId ends the iteration.
                while (docList.DocumentId >= 0)
                {
                    //Entity.DocumentPositionList docList = wifq.WordIndex[j];

                    // NOTE(review): drp is assigned but never read in this loop.
                    Core.SFQL.Parse.DocumentResultPoint drp;
                    drp.pDocumentResult = null;

                    // While j <= MinResultCount, track the largest Count seen so far.
                    // After that, skip documents whose Count is below that maximum.
                    if (j > MinResultCount)
                    {
                        if (oneWordMaxCount > docList.Count)
                        {
                            j++;
                            docList = wifq.WordIndex.GetNext();

                            continue;
                        }
                    }
                    else
                    {
                        if (oneWordMaxCount < docList.Count)
                        {
                            oneWordMaxCount = docList.Count;
                        }
                    }

                    // score = FieldRank * WordRank * Idf_t * Count * 1000000 / (Sum_d_t * TotalWordsInThisDocument)
                    long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                    // A negative score is treated as arithmetic overflow and replaced by a large constant.
                    if (score < 0)
                    {
                        //Overflow
                        score = long.MaxValue - 4000000;
                    }

                    // Apply the upstream filter (see the upDict parameter).
                    if (upDict == null)
                    {
                        docIdRank.Add(docList.DocumentId, score);
                    }
                    else
                    {
                        if (!upDict.Not)
                        {
                            if (upDict.ContainsKey(docList.DocumentId))
                            {
                                docIdRank.Add(docList.DocumentId, score);
                            }
                        }
                        else
                        {
                            if (!upDict.ContainsKey(docList.DocumentId))
                            {
                                docIdRank.Add(docList.DocumentId, score);
                            }
                        }
                    }

                    j++;
                    docList = wifq.WordIndex.GetNext();
                }
            }
            else
            {
                int wordIndexesLen = wordIndexes.Length;

                WordIndexForQuery fstWifq = wordIndexes[0]; //first word

                Entity.DocumentPositionList fstDocList = fstWifq.WordIndex.GetNext();

                // docListArr[i] holds the entry of word i for the document currently being examined.
                Entity.DocumentPositionList[] docListArr = new Hubble.Core.Entity.DocumentPositionList[wordIndexesLen];

                docListArr[0] = fstDocList;

                // Drive the loop with the first word's documents and probe every other word for the same id.
                while (fstDocList.DocumentId >= 0)
                {
                    int curWord    = 1;
                    int firstDocId = fstDocList.DocumentId;

                    while (curWord < wordIndexesLen)
                    {
                        docListArr[curWord] = wordIndexes[curWord].WordIndex.Get(firstDocId);

                        // A negative DocumentId means this word is not in the document: stop probing.
                        if (docListArr[curWord].DocumentId < 0)
                        {
                            break;
                        }

                        curWord++;
                    } //While

                    // The inner loop ran to the end only if every word was found.
                    if (curWord >= wordIndexesLen)
                    {
                        //Matched

                        // The document score is the sum of the per-word scores.
                        long totalScore = 0;
                        for (int i = 0; i < wordIndexesLen; i++)
                        {
                            WordIndexForQuery           wifq    = wordIndexes[i];
                            Entity.DocumentPositionList docList = docListArr[i];

                            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                            if (score < 0)
                            {
                                //Overflow
                                score = long.MaxValue - 4000000;
                            }

                            totalScore += score;
                        }

                        // Apply the upstream filter (see the upDict parameter).
                        if (upDict == null)
                        {
                            docIdRank.Add(firstDocId, totalScore);
                        }
                        else
                        {
                            if (!upDict.Not)
                            {
                                if (upDict.ContainsKey(firstDocId))
                                {
                                    docIdRank.Add(firstDocId, totalScore);
                                }
                            }
                            else
                            {
                                if (!upDict.ContainsKey(firstDocId))
                                {
                                    docIdRank.Add(firstDocId, totalScore);
                                }
                            }
                        }
                    }

                    fstDocList    = fstWifq.WordIndex.GetNext();
                    docListArr[0] = fstDocList;
                }
            }

            //Merge score if upDict != null
            if (upDict != null)
            {
                if (!upDict.Not)
                {
                    // Add the upstream score to every result that also exists upstream.
                    // Scores are updated through pointers; no keys are added or removed while enumerating.
                    foreach (int docid in docIdRank.Keys)
                    {
                        DocumentResult *upDrp;

                        if (upDict.TryGetValue(docid, out upDrp))
                        {
                            DocumentResult *drpResult;
                            if (docIdRank.TryGetValue(docid, out drpResult))
                            {
                                drpResult->Score += upDrp->Score;
                            }
                        }
                    }
                }
            }

            // Presumably removes deleted documents from docIdRank and returns how many were removed - confirm.
            DeleteProvider delProvider = _DBProvider.DelProvider;
            int            delCount    = delProvider.Filter(docIdRank);

            // NOTE(review): oneWordOptimize already implies CanLoadPartOfDocs, so the middle condition is redundant.
            if (oneWordOptimize && _QueryParameter.CanLoadPartOfDocs && upDict == null)
            {
                docIdRank.RelTotalCount = wordIndexes[0].RelTotalCount - delCount;
            }
            else
            {
                docIdRank.RelTotalCount = docIdRank.Count;
            }

            performanceReport.Stop();
        }
예제 #7
0
        /// <summary>
        /// Scores documents for the given words, weighting each word by how closely its
        /// position relative to the previous word matches the query, and adds the
        /// documents to <paramref name="docIdRank"/>.
        /// </summary>
        /// <remarks>
        /// The work is handed to <c>CalculateWithPositionMatch</c> when <c>UseMatch</c>
        /// returns true, and to an <c>IQueryOptimize</c> implementation when the query
        /// parameters allow it. Otherwise the general loop in this method runs.
        /// </remarks>
        /// <param name="upDict">
        /// Optional upstream dictionary used as a filter. When its <c>Not</c> flag is false
        /// only documents it contains are kept (and its scores are added to theirs);
        /// when <c>Not</c> is true only documents it does not contain are kept.
        /// </param>
        /// <param name="docIdRank">
        /// Result dictionary. It may be replaced by a larger pre-sized instance when it is empty.
        /// </param>
        /// <param name="wordIndexes">
        /// Words to search; sorted in place. The general loop reads <c>wordIndexes[0]</c> unconditionally.
        /// </param>
        unsafe private void CalculateWithPosition(Core.SFQL.Parse.DocumentResultWhereDictionary upDict,
                                                  ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
        {
            if (UseMatch(wordIndexes))
            {
                CalculateWithPositionMatch(upDict, ref docIdRank, wordIndexes);
                return;
            }

            // Sorts in place using the element type's own comparison (defined outside this block).
            Array.Sort(wordIndexes);

            // Presumably reorders the sorted array further; AdjustSort is defined outside this block - confirm.
            AdjustSort(wordIndexes);

            MinResultCount = _DBProvider.Table.GroupByLimit;

            //Get the smallest word index count (capped at 1M); it is used to pre-size the result
            int minWordDocListCount = 1 * 1024 * 1024; //1M

            foreach (WordIndexForQuery wifq in wordIndexes)
            {
                minWordDocListCount = Math.Min(minWordDocListCount, wifq.WordIndex.Count);
            }


            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

            //Merge
            bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.AndExpressionCanBeOptimized(_DBProvider) &&
                                   wordIndexes.Length == 1 && _NotInDict == null && _QueryParameter.End >= 0 && !_QueryParameter.NeedDistinct;

            // Single-word fast path: delegate everything to OneWordOptimize and return.
            if (oneWordOptimize)
            {
                IQueryOptimize qOptimize = QueryOptimizeBuilder.Build(typeof(OneWordOptimize),
                                                                      DBProvider, _QueryParameter.End, _QueryParameter.OrderBy,
                                                                      _QueryParameter.OrderBys, _QueryParameter.NeedGroupBy,
                                                                      _QueryParameter.OrderByCanBeOptimized, _QueryParameter.NeedFilterUntokenizedConditions(this._DBProvider),
                                                                      _QueryParameter.UntokenizedTreeOnRoot, wordIndexes);

                // The finally block stops the performance report even if the optimizer throws.
                try
                {
                    qOptimize.CalculateOptimize(upDict, ref docIdRank);
                    return;
                }
                finally
                {
                    performanceReport.Stop();
                }
            }

            // Same conditions as above without the single-word requirement: delegate to ContainsOptimize and return.
            if (this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.AndExpressionCanBeOptimized(_DBProvider) &&
                _NotInDict == null && _QueryParameter.End >= 0 && !_QueryParameter.NeedDistinct)
            {
                IQueryOptimize qOptimize = QueryOptimizeBuilder.Build(typeof(ContainsOptimize),
                                                                      DBProvider, _QueryParameter.End, _QueryParameter.OrderBy,
                                                                      _QueryParameter.OrderBys, _QueryParameter.NeedGroupBy,
                                                                      _QueryParameter.OrderByCanBeOptimized, _QueryParameter.NeedFilterUntokenizedConditions(this._DBProvider),
                                                                      _QueryParameter.UntokenizedTreeOnRoot, wordIndexes);

                try
                {
                    qOptimize.CalculateOptimize(upDict, ref docIdRank);
                    return;
                }
                finally
                {
                    performanceReport.Stop();
                }

                //if (qOptimize.Argument.IsOrderByScoreDesc())
                //{

                //}
            }

            // General path. Only an empty result dictionary is swapped for a pre-sized one.
            if (docIdRank.Count == 0)
            {
                if (minWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
                {
                    docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(minWordDocListCount);
                }
            }

            {
                // Exponent applied to the position weight: 2 / (number of words - 1), or 1 for a single word.
                double ratio = 1;

                if (wordIndexes.Length > 1)
                {
                    ratio = (double)2 / (double)(wordIndexes.Length - 1);
                }

                int wordIndexesLen = wordIndexes.Length;

                WordIndexForQuery fstWifq = wordIndexes[0]; //first word

                OriginalDocumentPositionList fstODPL = new OriginalDocumentPositionList();
                fstWifq.WordIndex.GetNextOriginal(ref fstODPL);

                //Entity.DocumentPositionList fstDocList = fstWifq.WordIndex.GetNext();

                // docListArr[i] holds the entry of word i for the document currently being examined.
                Entity.DocumentPositionList[] docListArr = new Hubble.Core.Entity.DocumentPositionList[wordIndexesLen];

                //docListArr[0] = fstDocList;
                fstODPL.ToDocumentPositionList(ref docListArr[0]);

                OriginalDocumentPositionList odpl = new OriginalDocumentPositionList();

                // Drive the loop with the first word's documents; a negative DocumentId ends it.
                while (fstODPL.DocumentId >= 0)
                {
                    int curWord    = 1;
                    int firstDocId = fstODPL.DocumentId;

                    while (curWord < wordIndexesLen)
                    {
                        //docListArr[curWord] = wordIndexes[curWord].WordIndex.Get(firstDocId);

                        wordIndexes[curWord].WordIndex.GetNextOriginalWithDocId(ref odpl, firstDocId);
                        odpl.ToDocumentPositionList(ref docListArr[curWord]);

                        if (docListArr[curWord].DocumentId < 0)
                        {
                            // A missing word is tolerated only when it carries the Or flag.
                            if ((wordIndexes[curWord].Flags & WordInfo.Flag.Or) != 0)
                            {
                                curWord++;
                                continue;
                            }
                            else
                            {
                                break;
                            }
                        }

                        curWord++;
                    } //While

                    // The inner loop ran to the end only if every required word was found.
                    if (curWord >= wordIndexesLen)
                    {
                        //Matched
                        //Calculate score

                        long totalScore = 0;
                        Entity.DocumentPositionList lastDocList
                            = new Hubble.Core.Entity.DocumentPositionList();

                        for (int i = 0; i < wordIndexesLen; i++)
                        {
                            WordIndexForQuery wifq = wordIndexes[i];

                            // NOTE(review): only words whose overall Count is 0 are skipped here. An Or-flagged
                            // word that is absent from this document but has Count > 0 is still scored from
                            // docListArr[i] - confirm that entry holds safe values in that case.
                            if (wifq.WordIndex.Count == 0)
                            {
                                //a^5000^0 b^5000^2^1
                                //if has a and hasn't b but b can be or
                                //2010-09-30 eaglet
                                continue;
                            }

                            Entity.DocumentPositionList docList = docListArr[i];


                            // score = FieldRank * WordRank * Idf_t * Count * 1000000 / (Sum_d_t * TotalWordsInThisDocument)
                            long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                            // A negative score is treated as arithmetic overflow and replaced by a large constant.
                            if (score < 0)
                            {
                                //Overflow
                                score = long.MaxValue - 4000000;
                            }

                            // Position weight; stays 1 for the first word.
                            double delta = 1;

                            if (i > 0)
                            {
                                //Calculate with position
                                // Compare the distance between this word and the previous one in the query
                                // with the distance between their first positions in the document.
                                double queryPositionDelta = wifq.FirstPosition - wordIndexes[i - 1].FirstPosition;
                                double positionDelta      = docList.FirstPosition - lastDocList.FirstPosition;

                                delta = Math.Abs(queryPositionDelta - positionDelta);

                                // Map small differences onto fixed steps; the lower bound also avoids dividing by zero below.
                                if (delta < 0.031)
                                {
                                    delta = 0.031;
                                }
                                else if (delta <= 1.1)
                                {
                                    delta = 0.5;
                                }
                                else if (delta <= 2.1)
                                {
                                    delta = 1;
                                }

                                // A smaller difference gives a larger weight, scaled by the occurrence counts
                                // in the document relative to the occurrence counts in the query.
                                delta = Math.Pow((1 / delta), ratio) * docList.Count * lastDocList.Count /
                                        (double)(wifq.QueryCount * wordIndexes[i - 1].QueryCount);
                            }

                            lastDocList = docList;

                            totalScore += (long)(score * delta);
                        }

                        // Documents listed in _NotInDict are excluded from the result.
                        bool notInDict = false;

                        if (_NotInDict != null)
                        {
                            if (_NotInDict.ContainsKey(firstDocId))
                            {
                                notInDict = true;
                            }
                        }

                        if (!notInDict)
                        {
                            // Apply the upstream filter (see the upDict parameter).
                            if (upDict == null)
                            {
                                docIdRank.Add(firstDocId, totalScore);
                            }
                            else
                            {
                                if (!upDict.Not)
                                {
                                    if (upDict.ContainsKey(firstDocId))
                                    {
                                        docIdRank.Add(firstDocId, totalScore);
                                    }
                                }
                                else
                                {
                                    if (!upDict.ContainsKey(firstDocId))
                                    {
                                        docIdRank.Add(firstDocId, totalScore);
                                    }
                                }
                            }
                        }
                    }//if (curWord >= wordIndexesLen)

                    //fstDocList = fstWifq.WordIndex.GetNext();
                    //docListArr[0] = fstDocList;

                    // Advance to the next document of the first word.
                    fstWifq.WordIndex.GetNextOriginal(ref fstODPL);
                    fstODPL.ToDocumentPositionList(ref docListArr[0]);
                }
            }

            //Merge score if upDict != null
            if (upDict != null)
            {
                if (!upDict.Not)
                {
                    // Add the upstream score to every result that also exists upstream.
                    // Scores are updated through pointers; no keys are added or removed while enumerating.
                    foreach (int docid in docIdRank.Keys)
                    {
                        DocumentResult *upDrp;

                        if (upDict.TryGetValue(docid, out upDrp))
                        {
                            DocumentResult *drpResult;
                            if (docIdRank.TryGetValue(docid, out drpResult))
                            {
                                drpResult->Score += upDrp->Score;
                            }
                        }
                    }
                }
            }

            // Presumably removes deleted documents from docIdRank and returns how many were removed - confirm.
            DeleteProvider delProvider = _DBProvider.DelProvider;
            int            delCount    = delProvider.Filter(docIdRank);

            // NOTE(review): when oneWordOptimize is true the method has already returned from the
            // single-word fast path above, so this condition is always false here and the else
            // branch always runs.
            if (oneWordOptimize && _QueryParameter.CanLoadPartOfDocs && upDict == null)
            {
                docIdRank.RelTotalCount = wordIndexes[0].RelTotalCount - delCount;
            }
            else
            {
                docIdRank.RelTotalCount = docIdRank.Count;
            }

            performanceReport.Stop();
        }
예제 #8
0
        unsafe private void CalculateWithPositionMatch(Core.SFQL.Parse.DocumentResultWhereDictionary upDict,
                                                       ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
        {
            Array.Sort(wordIndexes);

            MinResultCount = _DBProvider.Table.GroupByLimit;

            double ratio = 1;

            if (wordIndexes.Length > 1)
            {
                ratio = (double)2 / (double)(wordIndexes.Length - 1);
            }

            //Get max word doc list count
            int maxWordDocListCount = 0;
            int documentSum         = 0;

            foreach (WordIndexForQuery wifq in wordIndexes)
            {
                maxWordDocListCount += wifq.WordIndex.Count;
            }

            maxWordDocListCount += maxWordDocListCount / 2;

            if (maxWordDocListCount > 1024 * 1024)
            {
                maxWordDocListCount = 1024 * 1024;
            }

            if (docIdRank.Count == 0)
            {
                if (maxWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
                {
                    docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(maxWordDocListCount);
                }
            }

            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

            //Merge
            bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.NoAndExpression && wordIndexes.Length == 1;

            for (int i = 0; i < wordIndexes.Length; i++)
            {
                WordIndexForQuery wifq = wordIndexes[i];

                //Entity.DocumentPositionList[] wifqDocBuf = wifq.WordIndex.DocPositionBuf;

                Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
                int j = 0;
                int oneWordMaxCount = 0;

                while (docList.DocumentId >= 0)
                {
                    Core.SFQL.Parse.DocumentResultPoint drp;
                    drp.pDocumentResult = null;

                    if (oneWordOptimize)
                    {
                        if (j > MinResultCount)
                        {
                            if (oneWordMaxCount > docList.Count)
                            {
                                docList = wifq.WordIndex.GetNext();
                                j++;

                                continue;
                            }
                        }
                        else
                        {
                            if (oneWordMaxCount < docList.Count)
                            {
                                oneWordMaxCount = docList.Count;
                            }
                        }
                    }

                    if (j > wifq.RelTotalCount)
                    {
                        break;
                    }

                    long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                    if (score < 0)
                    {
                        //Overflow
                        score = long.MaxValue - 4000000;
                    }

                    bool exits = drp.pDocumentResult != null;

                    if (!exits && i > 0)
                    {
                        exits = docIdRank.TryGetValue(docList.DocumentId, out drp);
                    }

                    if (exits)
                    {
                        drp.pDocumentResult->Score += score;

                        double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
                        double positionDelta      = docList.FirstPosition - drp.pDocumentResult->LastPosition;

                        double delta = Math.Abs(queryPositionDelta - positionDelta);

                        if (delta < 0.031)
                        {
                            delta = 0.031;
                        }
                        else if (delta <= 1.1)
                        {
                            delta = 0.5;
                        }
                        else if (delta <= 2.1)
                        {
                            delta = 1;
                        }

                        delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount /
                                (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);

                        //some words missed
                        //if (i - drp.pDocumentResult->LastIndex > 1)
                        //{
                        //    int sumWordRank = 10;
                        //    for (int k = drp.pDocumentResult->LastIndex + 1; k < i; k++)
                        //    {
                        //        sumWordRank += wordIndexes[k].WordRank;
                        //    }

                        //    delta /= (double)sumWordRank;
                        //}

                        drp.pDocumentResult->Score        = (long)(drp.pDocumentResult->Score * delta);
                        drp.pDocumentResult->LastIndex    = (UInt16)i;
                        drp.pDocumentResult->LastPosition = docList.FirstPosition;
                        drp.pDocumentResult->LastCount    = (UInt16)docList.Count;
                        drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
                    }
                    else
                    {
                        //some words missed
                        //if (i > 0)
                        //{
                        //    int sumWordRank = 10;
                        //    for (int k = 0; k < i; k++)
                        //    {
                        //        sumWordRank += wordIndexes[k].WordRank;
                        //    }

                        //    double delta = 1 / (double)sumWordRank;
                        //    score = (long)(score * delta);
                        //}

                        bool notInDict = false;

                        if (_NotInDict != null)
                        {
                            if (_NotInDict.ContainsKey(docList.DocumentId))
                            {
                                notInDict = true;
                            }
                        }

                        if (!notInDict)
                        {
                            if (upDict == null)
                            {
                                DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                                docIdRank.Add(docList.DocumentId, docResult);
                            }
                            else
                            {
                                if (!upDict.Not)
                                {
                                    if (upDict.ContainsKey(docList.DocumentId))
                                    {
                                        DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                                        docIdRank.Add(docList.DocumentId, docResult);
                                    }
                                }
                                else
                                {
                                    if (!upDict.ContainsKey(docList.DocumentId))
                                    {
                                        DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                                        docIdRank.Add(docList.DocumentId, docResult);
                                    }
                                }
                            }
                        }
                    }

                    docList = wifq.WordIndex.GetNext();
                    j++;

                    if (j > wifq.WordIndex.Count)
                    {
                        break;
                    }
                }
            }

            //Merge score if upDict != null
            if (upDict != null)
            {
                if (!upDict.Not)
                {
                    foreach (int docid in docIdRank.Keys)
                    {
                        DocumentResult *upDrp;

                        if (upDict.TryGetValue(docid, out upDrp))
                        {
                            DocumentResult *drpResult;
                            if (docIdRank.TryGetValue(docid, out drpResult))
                            {
                                drpResult->Score += upDrp->Score;
                            }
                        }
                    }
                }
            }

            //some words missed
            //if (wordIndexes.Length > 1)
            //{
            //    List<DocumentResult> reduceDocs = new List<DocumentResult>(docIdRank.Count);
            //    int lstIndex = wordIndexes.Length - 1;
            //    foreach (Core.SFQL.Parse.DocumentResultPoint drp in docIdRank.Values)
            //    {
            //        DocumentResult* dr = drp.pDocumentResult;
            //        //DocumentResult* dr1 = drp.pDocumentResult;
            //        if (dr->LastIndex != lstIndex)
            //        {
            //            int sumWordRank = 10;
            //            for (int k = dr->LastIndex + 1; k <= lstIndex; k++)
            //            {
            //                sumWordRank += wordIndexes[k].WordRank;
            //            }

            //            double delta = 1 / (double)sumWordRank;

            //            dr->Score = (long)((double)dr->Score * delta);
            //        }

            //        if (dr->Score < 0)
            //        {
            //            dr->Score = long.MaxValue / 10;
            //        }
            //    }
            //}

            performanceReport.Stop();

            documentSum += docIdRank.Count;

            if (documentSum > _TotalDocuments)
            {
                documentSum = _TotalDocuments;
            }

            DeleteProvider delProvider = _DBProvider.DelProvider;
            int            deleteCount = delProvider.Filter(docIdRank);

            if (_QueryParameter.CanLoadPartOfDocs && upDict == null)
            {
                if (docIdRank.Count < wordIndexes[wordIndexes.Length - 1].RelTotalCount)
                {
                    if (wordIndexes.Length > 1)
                    {
                        if (wordIndexes[wordIndexes.Length - 1].RelTotalCount > _DBProvider.MaxReturnCount)
                        {
                            documentSum += wordIndexes[wordIndexes.Length - 1].RelTotalCount - _DBProvider.MaxReturnCount;
                        }

                        if (documentSum > _TotalDocuments)
                        {
                            documentSum = _TotalDocuments;
                        }

                        docIdRank.RelTotalCount = documentSum;
                    }
                    else
                    {
                        docIdRank.RelTotalCount = wordIndexes[wordIndexes.Length - 1].RelTotalCount;
                    }
                }
            }

            docIdRank.RelTotalCount -= deleteCount;
        }
예제 #9
0
        /// <summary>
        /// Queries the backing database table for document ids and wraps them in a result dictionary.
        /// </summary>
        /// <param name="end">
        /// When non-negative, the row query is limited to <c>top (end + 1)</c>; a negative value
        /// removes the limit.
        /// </param>
        /// <param name="where">SQL predicate placed after <c>where</c>; null or empty selects every row.</param>
        /// <param name="orderby">SQL ordering placed after <c>order by</c>; null or empty adds no ordering.</param>
        /// <returns>
        /// One entry per returned row (rows whose replace-field value maps to a negative doc id
        /// are skipped). <c>RelTotalCount</c> holds the count of all rows matching
        /// <paramref name="where"/>, independent of the <c>top</c> limit.
        /// </returns>
        /// <remarks>
        /// NOTE(review): <paramref name="where"/> and <paramref name="orderby"/> are spliced into the
        /// SQL text verbatim, so they must never carry unvalidated user input - confirm at the callers.
        /// </remarks>
        public Core.SFQL.Parse.DocumentResultWhereDictionary GetDocumentResults(int end, string where, string orderby)
        {
            bool useDocIdColumn = DocIdReplaceField == null;

            string topClause   = end >= 0 ? string.Format("top {0} ", end + 1) : string.Empty;
            string idColumn    = useDocIdColumn ? "docid" : string.Format("[{0}]", DocIdReplaceField);
            string whereClause = string.IsNullOrEmpty(where) ? string.Empty : " where " + where;

            string sql = string.Format("select {0}{1} from [{2}]{3}",
                                       topClause, idColumn, Table.DBTableName, whereClause);

            if (!string.IsNullOrEmpty(orderby))
            {
                sql += " order by " + orderby;
            }

            Core.SFQL.Parse.DocumentResultWhereDictionary result = new Core.SFQL.Parse.DocumentResultWhereDictionary();

            using (SQLDataProvider sqlData = new SQLDataProvider())
            {
                sqlData.Connect(Table.ConnectionString);

                foreach (System.Data.DataRow row in sqlData.QuerySql(sql).Tables[0].Rows)
                {
                    int docId;

                    if (useDocIdColumn)
                    {
                        docId = int.Parse(row[0].ToString());
                    }
                    else
                    {
                        docId = DBProvider.GetDocIdFromDocIdReplaceFieldValue(long.Parse(row[DocIdReplaceField].ToString()));

                        // A negative id means the replace-field value has no doc id; skip the row.
                        if (docId < 0)
                        {
                            continue;
                        }
                    }

                    result.Add(docId, new Hubble.Core.Query.DocumentResult(docId));
                }

                // The table name is bracket-quoted here exactly as in the row query above, so a
                // name that only works when delimited does not fail on the count query.
                string countSql = string.Format("select count(*) cnt from [{0}]{1}",
                                                Table.DBTableName, whereClause);

                System.Data.DataSet ds = sqlData.QuerySql(countSql);

                result.RelTotalCount = int.Parse(ds.Tables[0].Rows[0][0].ToString());
            }

            return(result);
        }
예제 #10
0
        /// <summary>
        /// Scores every document that contains at least one of the query words, without using
        /// position information, and accumulates the scores in <paramref name="docIdRank"/>.
        /// </summary>
        /// <param name="upDict">
        /// Result of the enclosing expression, or null. When non-null it filters which documents
        /// are added: with <c>Not</c> false only ids it contains are kept (and its scores are
        /// added to theirs afterwards); with <c>Not</c> true only ids it does not contain are kept.
        /// </param>
        /// <param name="docIdRank">
        /// Receives the results. When it is empty on entry it may be replaced by a dictionary
        /// pre-sized for the expected number of documents.
        /// </param>
        /// <param name="wordIndexes">
        /// Word indexes of the query; sorted in place. The last element is read when partial
        /// loading is allowed and <paramref name="upDict"/> is null, so the array must not be empty
        /// in that case.
        /// </param>
        unsafe private void Calculate(DocumentResultWhereDictionary upDict,
                                      ref DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
        {
            Array.Sort(wordIndexes);

            MinResultCount = _DBProvider.Table.GroupByLimit;

            // Sum of the per-word document counts: an upper bound for the number of distinct hits.
            int maxWordDocListCount = 0;

            foreach (WordIndexForQuery wifq in wordIndexes)
            {
                maxWordDocListCount += wifq.WordIndex.RelDocCount;
            }

            // Pre-size the (still empty) result dictionary when the default capacity is too small.
            if (docIdRank.Count == 0 && maxWordDocListCount > DocumentResultWhereDictionary.DefaultSize)
            {
                docIdRank = new Core.SFQL.Parse.DocumentResultWhereDictionary(maxWordDocListCount);
            }

            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

            //Merge
            bool oneWordOptimize = this._QueryParameter.CanLoadPartOfDocs && this._QueryParameter.NoAndExpression && wordIndexes.Length == 1;

            for (int i = 0; i < wordIndexes.Length; i++)
            {
                WordIndexForQuery wifq = wordIndexes[i];

                Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
                int j = 0;
                int oneWordMaxCount = 0;

                while (docList.DocumentId >= 0)
                {
                    // NOTE(review): oneWordMaxCount is never raised above 0. The statement that
                    // was meant to track the largest Count sat in an else branch under a duplicated
                    // "j > MinResultCount" test and could never run, so it has been dropped. As a
                    // result this skip only fires for a negative docList.Count, i.e. the one-word
                    // optimization is effectively disabled. Re-enabling it would change which
                    // documents are returned - confirm the intended rule before doing so.
                    if (oneWordOptimize && j > MinResultCount && oneWordMaxCount > docList.Count)
                    {
                        docList = wifq.WordIndex.GetNext();
                        j++;

                        continue;
                    }

                    long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                    if (score < 0)
                    {
                        //Overflow
                        score = long.MaxValue - 4000000;
                    }

                    // A document can only be present already once the first word has been processed.
                    Core.SFQL.Parse.DocumentResultPoint drp;
                    drp.pDocumentResult = null;

                    bool exists = i > 0 && docIdRank.TryGetValue(docList.DocumentId, out drp);

                    if (exists)
                    {
                        drp.pDocumentResult->Score += score;
                    }
                    else if (_NotInDict == null || !_NotInDict.ContainsKey(docList.DocumentId))
                    {
                        // Apply the upDict filter: keep members for a normal upDict,
                        // keep non-members for a negated one.
                        bool accepted = upDict == null ||
                                        (upDict.Not
                                             ? !upDict.ContainsKey(docList.DocumentId)
                                             : upDict.ContainsKey(docList.DocumentId));

                        if (accepted)
                        {
                            docIdRank.Add(docList.DocumentId, score);
                        }
                    }

                    docList = wifq.WordIndex.GetNext();
                    j++;
                }
            }

            //Merge score if upDict != null
            if (upDict != null && !upDict.Not)
            {
                foreach (int docid in docIdRank.Keys)
                {
                    DocumentResult *upDrp;
                    DocumentResult *drpResult;

                    if (upDict.TryGetValue(docid, out upDrp) && docIdRank.TryGetValue(docid, out drpResult))
                    {
                        drpResult->Score += upDrp->Score;
                    }
                }
            }

            // Taken before deleted documents are filtered out; capped at the total document count.
            int documentSum = docIdRank.Count;

            if (documentSum > _TotalDocuments)
            {
                documentSum = _TotalDocuments;
            }

            DeleteProvider delProvider = _DBProvider.DelProvider;
            int            deleteCount = delProvider.Filter(docIdRank);

            if (_QueryParameter.CanLoadPartOfDocs && upDict == null)
            {
                WordIndexForQuery lastWordIndex = wordIndexes[wordIndexes.Length - 1];

                if (docIdRank.Count < lastWordIndex.RelTotalCount)
                {
                    if (wordIndexes.Length > 1)
                    {
                        if (lastWordIndex.RelTotalCount > _DBProvider.MaxReturnCount)
                        {
                            documentSum += lastWordIndex.RelTotalCount - _DBProvider.MaxReturnCount;
                        }

                        if (documentSum > _TotalDocuments)
                        {
                            documentSum = _TotalDocuments;
                        }

                        docIdRank.RelTotalCount = documentSum;
                    }
                    else
                    {
                        docIdRank.RelTotalCount = lastWordIndex.RelTotalCount;
                    }
                }
            }

            docIdRank.RelTotalCount -= deleteCount;

            performanceReport.Stop();
        }
예제 #11
0
        /// <summary>
        /// Position-aware ranking for queries ordered by score descending. Walks every document
        /// produced by a <c>MultiWordsDocIdEnumerator</c>, scores it from the selected word hits and
        /// fills <paramref name="docIdRank"/> with either every scored document (no end index) or
        /// the content of a priority queue created with capacity <c>top</c>.
        /// </summary>
        /// <param name="upDict">Must be null; any other value raises a <c>ParseException</c>.</param>
        /// <param name="docIdRank">
        /// Receives the scored documents and is flagged as sorted. It is also handed to the
        /// enumerator as the group-by dictionary when <c>Argument.NeedGroupBy</c> is set.
        /// </param>
        /// <param name="wordIndexes">Word indexes of the query words.</param>
        unsafe private void CalculateWithPositionOrderByScoreDesc(Core.SFQL.Parse.DocumentResultWhereDictionary upDict,
                                                                  ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
        {
            // Value substituted whenever a score turns negative, which the code treats as overflow.
            const long overflowScore = long.MaxValue - 4000000;

            DBProvider     provider          = Argument.DBProvider;
            bool           filterUntokenized = this.Argument.NeedFilterUntokenizedConditions;
            ExpressionTree untokenizedExpr   = this.Argument.UntokenizedTreeOnRoot;

            if (upDict != null)
            {
                throw new ParseException("UpDict is not null!");
            }

            // Result window: the smallest multiple of 100 above End, or unbounded without an End.
            int top = int.MaxValue;

            if (Argument.End >= 0)
            {
                top = (1 + Argument.End / 100) * 100;

                if (top <= 0)
                {
                    top = 100;
                }
            }

            bool unbounded = top == int.MaxValue;

            // Exactly one of the two collectors is used, depending on whether a window exists.
            PriorQueue <Docid2Long> topQueue  = unbounded ? null : new PriorQueue <Docid2Long>(top, new DocIdLongComparer(false));
            List <Docid2Long>       allScored = unbounded ? new List <Docid2Long>() : null;

            // Score of topQueue.Last, refreshed after each insertion once the queue holds top entries.
            long queueFloor = 0;
            int  queued     = 0;

            Core.SFQL.Parse.DocumentResultWhereDictionary groupByDict = null;

            if (Argument.NeedGroupBy)
            {
                groupByDict = docIdRank;
            }

            // -1 is passed instead of top on purpose: the original author disabled the top
            // optimization on 2012-3-18 because it affected the search result.
            MultiWordsDocIdEnumerator docEnum = new MultiWordsDocIdEnumerator(wordIndexes, provider, groupByDict, -1,
                                                                              filterUntokenized);

            Entity.OriginalDocumentPositionList current = new Hubble.Core.Entity.OriginalDocumentPositionList();

            Entity.DocumentPositionList previousHit
                = new Hubble.Core.Entity.DocumentPositionList();

            double ratio = wordIndexes.Length > 1
                               ? (double)2 / (double)(wordIndexes.Length - 1)
                               : 1;

            // Stack-allocated scratch record used only to evaluate the untokenized conditions.
            Query.DocumentResult  scratchResult;
            Query.DocumentResult *scratch = &scratchResult;
            int filteredOut               = 0; //skipped by the untokenized conditions

            for (docEnum.GetNextOriginal(ref current); current.DocumentId >= 0; docEnum.GetNextOriginal(ref current))
            {
                // Documents that fail the untokenized conditions are counted and skipped.
                if (filterUntokenized)
                {
                    int candidateId = current.DocumentId;
                    scratch->DocId       = candidateId;
                    scratch->PayloadData = provider.GetPayloadDataWithShareLock(candidateId);

                    if (!ParseWhere.GetComparisionExpressionValue(provider, scratch,
                                                                  untokenizedExpr))
                    {
                        filteredOut++;
                        continue;
                    }
                }

                // Matched: add up the score of every selected word hit of this document.
                long totalScore = 0;
                previousHit.Count         = 0;
                previousHit.FirstPosition = 0;
                int previousWordSlot = 0;

                for (int i = 0; i < docEnum.SelectedCount; i++)
                {
                    int wordSlot = docEnum.SelectedIndexes[i];

                    WordIndexForQuery wifq = docEnum.WordIndexes[wordSlot];

                    Int16 hitCount    = (Int16)docEnum.SelectedDocLists[i].Count;
                    int   hitPosition = docEnum.SelectedDocLists[i].FirstPosition;
                    int   wordsInDoc  = docEnum.SelectedDocLists[i].TotalWordsInThisDocument;

                    long wordScore = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)hitCount * (long)1000000 / ((long)wifq.Sum_d_t * (long)wordsInDoc);

                    if (wordScore < 0)
                    {
                        wordScore = overflowScore;
                    }

                    double weight = 1;

                    if (i > 0)
                    {
                        // Compare the distance between this word and the previous one in the
                        // query with the distance between their first hits in the document.
                        // NOTE(review): previousWordSlot indexes docEnum.WordIndexes but is applied
                        // to wordIndexes here - presumably the same array; verify.
                        double queryGap = wifq.FirstPosition - wordIndexes[previousWordSlot].FirstPosition;
                        double docGap   = hitPosition - previousHit.FirstPosition;
                        double mismatch = Math.Abs(queryGap - docGap);

                        // Small mismatches are snapped to fixed values; larger ones are kept.
                        mismatch = mismatch < 0.031 ? 0.031
                                 : mismatch <= 1.1  ? 0.5
                                 : mismatch <= 2.1  ? 1
                                 : mismatch;

                        weight = Math.Pow((1 / mismatch), ratio) * hitCount * previousHit.Count /
                                 (double)(wifq.QueryCount * wordIndexes[previousWordSlot].QueryCount);
                    }

                    previousHit.Count         = hitCount;
                    previousHit.FirstPosition = hitPosition;
                    previousWordSlot          = wordSlot;

                    totalScore += (long)(wordScore * weight);
                }

                if (_HasRankField)
                {
                    int rank = provider.SharedPayloadProvider.GetPayloadRank(current.DocumentId);
                    totalScore *= rank;

                    if (totalScore < 0)
                    {
                        totalScore = overflowScore;
                    }
                }

                // Documents that matched every query word get ten times the score.
                if (docEnum.SelectedCount == wordIndexes.Length)
                {
                    totalScore *= 10;

                    if (totalScore < 0)
                    {
                        totalScore = overflowScore;
                    }
                }

                // Hand the scored document to whichever collector is active.
                if (unbounded)
                {
                    allScored.Add(new Docid2Long(current.DocumentId, totalScore));
                }
                else if (queued < top)
                {
                    topQueue.Add(new Docid2Long(current.DocumentId, totalScore));
                    queued++;

                    if (queued == top)
                    {
                        queueFloor = topQueue.Last.Value1;
                    }
                }
                else if (queueFloor < totalScore)
                {
                    // Queue already holds top entries: only a score above the floor is offered.
                    topQueue.Add(new Docid2Long(current.DocumentId, totalScore));
                    queueFloor = topQueue.Last.Value1;
                }
            }

            docIdRank.RelTotalCount = docEnum.TotalDocIdCount - filteredOut;

            Docid2Long[] ranked;

            if (unbounded)
            {
                allScored.Sort(new DocIdLongComparer(false));
                ranked = allScored.ToArray();
            }
            else
            {
                ranked = topQueue.ToArray();
            }

            foreach (Docid2Long entry in ranked)
            {
                long finalScore = entry.Value1;

                if (finalScore < 0)
                {
                    finalScore = overflowScore;
                }

                docIdRank.Add(entry.DocId, new DocumentResult(entry.DocId, finalScore));
            }

            docIdRank.Sorted = true;
        }
예제 #12
0
        /// <summary>
        /// Removes every document listed in the delete table from <paramref name="docIdResult"/>
        /// and, when the dictionary holds group-by records, from its group-by collection.
        /// </summary>
        /// <param name="docIdResult">docid result dictionary; modified in place</param>
        /// <returns>Number of entries removed from <paramref name="docIdResult"/></returns>
        /// <remarks>
        /// NOTE(review): the method synchronizes on <c>this</c>. Switching to a private lock object
        /// would need every other user of this monitor to change with it, so it is left as is.
        /// </remarks>
        public int Filter(Core.SFQL.Parse.DocumentResultWhereDictionary docIdResult)
        {
            lock (this)
            {
                if (_DeleteTbl.Count <= 0)
                {
                    return(0);
                }

                int  deleteCount       = 0;
                bool hasGroupByRecords = docIdResult.GroupByCollection.Count > 0;

                if (_DeleteTbl.Count < docIdResult.Count)
                {
                    // Fewer deleted ids than results: probe the result dictionary per deleted id.
                    foreach (int docid in _DeleteTbl.Keys)
                    {
                        // Remove reports whether the id was present, so no separate ContainsKey
                        // lookup is needed (same pattern as the other branch).
                        if (docIdResult.Remove(docid))
                        {
                            deleteCount++;
                        }

                        if (hasGroupByRecords)
                        {
                            docIdResult.RemoveFromGroupByCollection(docid);
                        }
                    }
                }
                else
                {
                    // Fewer results than deleted ids: probe the delete table per result id.
                    // The hits are collected first because the dictionary cannot be modified
                    // while its keys are being enumerated.
                    List <int> deleDocIdList = new List <int>();

                    foreach (int docid in docIdResult.Keys)
                    {
                        if (_DeleteTbl.ContainsKey(docid))
                        {
                            deleDocIdList.Add(docid);
                        }
                    }

                    // NOTE(review): here only ids that are also keys of docIdResult are removed
                    // from the group-by collection, whereas the branch above removes every deleted
                    // id. If the group-by collection can hold ids that are not keys of docIdResult,
                    // deleted ids may survive in it on this path - verify.
                    foreach (int docid in deleDocIdList)
                    {
                        if (docIdResult.Remove(docid))
                        {
                            deleteCount++;
                        }

                        if (hasGroupByRecords)
                        {
                            docIdResult.RemoveFromGroupByCollection(docid);
                        }
                    }
                }

                if (hasGroupByRecords)
                {
                    docIdResult.CompreassGroupByCollection();
                }

                return(deleteCount);
            }
        }
예제 #13
0
        /// <summary>
        /// Order by score desc
        /// and only one expression in the banch of expression tree.
        /// and more than two words
        /// </summary>
        /// <param name="upDict"></param>
        /// <param name="docIdRank"></param>
        /// <param name="wordIndexes"></param>
        unsafe private void CalculateWithPositionOrderByScoreDesc11(Core.SFQL.Parse.DocumentResultWhereDictionary upDict,
                                                                    ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank, WordIndexForQuery[] wordIndexes)
        {
            if (upDict != null)
            {
                throw new ParseException("UpDict is not null!");
            }

            Array.Sort(wordIndexes);

            //Calculate top
            int top;

            if (this._QueryParameter.End >= 0)
            {
                top = (1 + this._QueryParameter.End / 100) * 100;

                if (top <= 0)
                {
                    top = 100;
                }

                //if (this._QueryParameter.End * 2 > top)
                //{
                //    top *= 2;
                //}
            }
            else
            {
                top = int.MaxValue;
            }

            double ratio = 1;

            if (wordIndexes.Length > 1)
            {
                ratio = (double)2 / (double)(wordIndexes.Length - 1);
            }

            //Get max word doc list count
            int maxWordDocListCount = 0;
            int documentSum         = 0;

            foreach (WordIndexForQuery wifq in wordIndexes)
            {
                maxWordDocListCount += wifq.WordIndex.Count;
            }

            maxWordDocListCount += maxWordDocListCount / 2;

            if (maxWordDocListCount > 1024 * 1024)
            {
                maxWordDocListCount = 1024 * 1024;
            }

            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Calculate");

            bool groupbyScanAll = false;

            //Match for group by
            if (this._QueryParameter.NeedGroupBy)
            {
                groupbyScanAll = true;

                int    groupbyContainsCount = 0;
                int    groupbyLimit         = _DBProvider.Table.GroupByLimit;
                BitSet bitSet = new BitSet();

                for (int i = 0; i < wordIndexes.Length; i++)
                {
                    WordIndexForQuery           wifq    = wordIndexes[i];
                    Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();

                    while (docList.DocumentId >= 0)
                    {
                        if (bitSet.ForceAdd(docList.DocumentId))
                        {
                            groupbyContainsCount++;
                        }

                        if (groupbyContainsCount >= groupbyLimit)
                        {
                            groupbyScanAll = false;
                            break;
                        }

                        docList = wifq.WordIndex.GetNext();
                    }

                    wifq.WordIndex.Reset();

                    if (!groupbyScanAll)
                    {
                        break;
                    }
                }

                AscIntList groupByCollect = new AscIntList();
                groupByCollect.AddRange(bitSet);
            }

            //Merge
            int indexInTop = 0;

            for (int i = 0; i < wordIndexes.Length; i++)
            {
                if (docIdRank.Count >= top)
                {
                    break;
                }

                indexInTop = i;

                WordIndexForQuery wifq = wordIndexes[i];

                //Entity.DocumentPositionList[] wifqDocBuf = wifq.WordIndex.DocPositionBuf;

                Entity.DocumentPositionList docList = wifq.WordIndex.GetNext();
                int j = 0;

                while (docList.DocumentId >= 0)
                {
                    Core.SFQL.Parse.DocumentResultPoint drp;
                    drp.pDocumentResult = null;

                    long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                    if (score < 0)
                    {
                        //Overflow
                        score = long.MaxValue - 4000000;
                    }
                    else
                    {
                        switch (i)
                        {
                        case 0:
                            score *= 20;
                            break;

                        case 1:
                            score *= 4;
                            break;

                        case 2:
                            score *= 1;
                            break;

                        case 3:
                            score /= 2;
                            break;

                        default:
                            score /= i;
                            break;
                        }
                    }

                    if (score < 0)
                    {
                        //Overflow
                        score = long.MaxValue - 4000000;
                    }

                    bool exits = drp.pDocumentResult != null;

                    if (!exits && i > 0)
                    {
                        exits = docIdRank.TryGetValue(docList.DocumentId, out drp);
                    }

                    if (exits)
                    {
                        drp.pDocumentResult->Score += score;
                        drp.pDocumentResult->HitCount++;

                        double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
                        double positionDelta      = docList.FirstPosition - drp.pDocumentResult->LastPosition;

                        double delta = Math.Abs(queryPositionDelta - positionDelta);

                        if (delta < 0.031)
                        {
                            delta = 0.031;
                        }
                        else if (delta <= 1.1)
                        {
                            delta = 0.5;
                        }
                        else if (delta <= 2.1)
                        {
                            delta = 1;
                        }

                        delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount /
                                (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);

                        drp.pDocumentResult->Score = (long)(drp.pDocumentResult->Score * delta);

                        //Overflow, if match too much, sometime score would less than zero.
                        if (drp.pDocumentResult->Score < 0)
                        {
                            drp.pDocumentResult->Score = long.MaxValue - 4000000;
                        }

                        drp.pDocumentResult->LastIndex    = (UInt16)i;
                        drp.pDocumentResult->LastPosition = docList.FirstPosition;
                        drp.pDocumentResult->LastCount    = (UInt16)docList.Count;
                        drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
                    }
                    else
                    {
                        bool notInDict = false;

                        if (_NotInDict != null)
                        {
                            if (_NotInDict.ContainsKey(docList.DocumentId))
                            {
                                notInDict = true;
                            }
                        }

                        if (!notInDict)
                        {
                            //upDict is null in this function
                            DocumentResult docResult = new DocumentResult(docList.DocumentId, score, wifq.FirstPosition, wifq.QueryCount, docList.FirstPosition, docList.Count, i);
                            docIdRank.Add(docList.DocumentId, docResult);
                        }
                    }

                    docList = wifq.WordIndex.GetNext();
                    j++;

                    if (j > wifq.WordIndex.Count)
                    {
                        break;
                    }
                }
            }

            long maxScoreValue  = 0; //Max score value of the docid that hit count less than wordIndexes.Length
            int  wordIndexesLen = wordIndexes.Length;

            //Get the max score value of the docs that hit count less than wordIndexes.Length
            foreach (DocumentResultPoint docResult in docIdRank.Values)
            {
                if (docResult.pDocumentResult->HitCount < wordIndexesLen)
                {
                    if (docResult.pDocumentResult->Score > maxScoreValue)
                    {
                        maxScoreValue = docResult.pDocumentResult->Score;
                    }
                }
            }

            double hitRate = 0;

            if (indexInTop < wordIndexes.Length - 1)
            {
                int[] docidlist = new int[docIdRank.Count];

                int i = 0;
                foreach (int docid in docIdRank.Keys)
                {
                    docidlist[i] = docid;
                    i++;
                }

                Array.Sort(docidlist);

                int lastWordHitCount = 0;

                foreach (int firstDocId in docidlist)
                {
                    int curWord = indexInTop + 1;

                    Core.SFQL.Parse.DocumentResultPoint drp;

                    if (docIdRank.TryGetValue(firstDocId, out drp))
                    {
                        while (curWord < wordIndexesLen)
                        {
                            Entity.DocumentPositionList docList = wordIndexes[curWord].WordIndex.Get(firstDocId);
                            int curDocId = docList.DocumentId;

                            if (curDocId >= 0)
                            {
                                drp.pDocumentResult->HitCount++;

                                if (curWord == wordIndexesLen - 1)
                                {
                                    lastWordHitCount++;
                                }

                                WordIndexForQuery wifq = wordIndexes[curWord];

                                long score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)docList.Count * (long)1000000 / ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);

                                if (score < 0)
                                {
                                    //Overflow
                                    score = long.MaxValue - 4000000;
                                }
                                else
                                {
                                    switch (curWord)
                                    {
                                    case 0:
                                        score *= 20;
                                        break;

                                    case 1:
                                        score *= 4;
                                        break;

                                    case 2:
                                        score *= 1;
                                        break;

                                    case 3:
                                        score /= 2;
                                        break;

                                    default:
                                        score /= curWord;
                                        break;
                                    }
                                }

                                if (score < 0)
                                {
                                    //Overflow
                                    score = long.MaxValue - 4000000;
                                }

                                drp.pDocumentResult->Score += score;

                                double queryPositionDelta = wifq.FirstPosition - drp.pDocumentResult->LastWordIndexFirstPosition;
                                double positionDelta      = docList.FirstPosition - drp.pDocumentResult->LastPosition;

                                double delta = Math.Abs(queryPositionDelta - positionDelta);

                                if (delta < 0.031)
                                {
                                    delta = 0.031;
                                }
                                else if (delta <= 1.1)
                                {
                                    delta = 0.5;
                                }
                                else if (delta <= 2.1)
                                {
                                    delta = 1;
                                }

                                delta = Math.Pow((1 / delta), ratio) * docList.Count * drp.pDocumentResult->LastCount /
                                        (double)(wifq.QueryCount * drp.pDocumentResult->LastWordIndexQueryCount);

                                drp.pDocumentResult->Score = (long)(drp.pDocumentResult->Score * delta);



                                //Overflow, if match too much, sometime score would less than zero.
                                if (drp.pDocumentResult->Score < 0)
                                {
                                    drp.pDocumentResult->Score = long.MaxValue - 4000000;
                                }

                                drp.pDocumentResult->LastIndex    = (UInt16)curWord;
                                drp.pDocumentResult->LastPosition = docList.FirstPosition;
                                drp.pDocumentResult->LastCount    = (UInt16)docList.Count;
                                drp.pDocumentResult->LastWordIndexFirstPosition = (UInt16)wifq.FirstPosition;
                            }
                            curWord++;
                        } //While

                        if (drp.pDocumentResult->HitCount < wordIndexesLen)
                        {
                            if (drp.pDocumentResult->Score > maxScoreValue)
                            {
                                maxScoreValue = drp.pDocumentResult->Score;
                            }
                        }
                    }
                }

                if (docidlist.Length > 0)
                {
                    hitRate = (double)lastWordHitCount / (double)docidlist.Length;
                }
            }

            //Adjust score of the docs that hit count equal wordIndexes.Length
            foreach (DocumentResultPoint docResult in docIdRank.Values)
            {
                if (docResult.pDocumentResult->HitCount == wordIndexesLen)
                {
                    docResult.pDocumentResult->Score += maxScoreValue;

                    if (docResult.pDocumentResult->Score < 0)
                    {
                        docResult.pDocumentResult->Score = long.MaxValue;
                    }
                }
            }

            performanceReport.Stop();

            documentSum += docIdRank.Count;

            if (indexInTop < wordIndexes.Length - 1)
            {
                documentSum += wordIndexes[wordIndexes.Length - 1].RelTotalCount;

                if (hitRate > 0)
                {
                    int predictCount = 0;

                    for (int i = indexInTop + 1; i < wordIndexes.Length - 1; i++)
                    {
                        predictCount += (int)(wordIndexes[i].RelTotalCount * (1 - hitRate));
                    }

                    documentSum += predictCount;
                }
            }

            if (documentSum > _TotalDocuments)
            {
                documentSum = _TotalDocuments;
            }

            docIdRank.RelTotalCount = documentSum;

            DeleteProvider delProvider = _DBProvider.DelProvider;
            int            deleteCount = delProvider.Filter(docIdRank);

            docIdRank.RelTotalCount -= deleteCount;

            if (groupbyScanAll)
            {
                docIdRank.RelTotalCount = docIdRank.GroupByCollection.Count;
            }
            else if (docIdRank.GroupByCollection.Count > docIdRank.RelTotalCount)
            {
                docIdRank.RelTotalCount = docIdRank.GroupByCollection.Count;
            }
        }
예제 #14
0
        /// <summary>
        /// Executes the match query against the prepared word indexes and returns the
        /// matching documents.
        /// </summary>
        /// <remarks>
        /// The calculation routine is chosen from the index mode and the query parameters:
        /// <list type="bullet">
        /// <item>Simple index mode always uses <c>Calculate</c>.</item>
        /// <item>NOT queries always use the full (non-optimized) calculation and are computed
        /// without <c>UpDict</c>; the result is merged with <c>UpDict</c> afterwards.</item>
        /// <item>Otherwise, when the query parameters allow it and <c>UpDict</c> is null, an
        /// <c>IQueryOptimize</c> built for <c>MatchOptimize</c> performs the calculation.</item>
        /// </list>
        /// </remarks>
        /// <returns>
        /// The calculated result dictionary. When there are no query words or no word indexes,
        /// returns <c>UpDict</c> for a NOT query with a non-null <c>UpDict</c>, otherwise an
        /// empty dictionary.
        /// </returns>
        public Core.SFQL.Parse.DocumentResultWhereDictionary Search()
        {
            Query.PerformanceReport performanceReport = new Hubble.Core.Query.PerformanceReport("Search of Match");

            Init();

            Core.SFQL.Parse.DocumentResultWhereDictionary result = new Core.SFQL.Parse.DocumentResultWhereDictionary();

            // Nothing to search for.
            // NOTE(review): performanceReport is not stopped on these early-return paths;
            // kept as in the original - confirm whether that is intended.
            if (_QueryWords.Count <= 0 || _WordIndexes.Length <= 0)
            {
                if (_QueryParameter.Not && UpDict != null)
                {
                    return(UpDict);
                }
                else
                {
                    return(result);
                }
            }

            if (this._QueryParameter.Not)
            {
                if (_InvertedIndex.IndexMode == Field.IndexMode.Simple)
                {
                    Calculate(null, ref result, _WordIndexes);
                }
                else
                {
                    // For not, we have to return all of the records
                    // (modified at 18 Jan 2012), so the full calculation is always used
                    // here and no order-by-score-desc shortcut is taken.
                    CalculateWithPosition(null, ref result, _WordIndexes);
                }
            }
            else
            {
                if (_InvertedIndex.IndexMode == Field.IndexMode.Simple)
                {
                    Calculate(this.UpDict, ref result, _WordIndexes);
                }
                else
                {
                    bool canOptimize =
                        !this._QueryParameter.NeedDistinct && this._QueryParameter.CanLoadPartOfDocs &&
                        this._QueryParameter.AndExpressionCanBeOptimized(_DBProvider) &&
                        _WordIndexes.Length > 1 && this.UpDict == null;

                    if (canOptimize)
                    {
                        IQueryOptimize qOptimize = QueryOptimizeBuilder.Build(typeof(MatchOptimize),
                                                                              DBProvider, _QueryParameter.End, _QueryParameter.OrderBy,
                                                                              _QueryParameter.OrderBys, _QueryParameter.NeedGroupBy,
                                                                              _QueryParameter.OrderByCanBeOptimized, _QueryParameter.NeedFilterUntokenizedConditions(this._DBProvider),
                                                                              _QueryParameter.UntokenizedTreeOnRoot, _WordIndexes);

                        // Create the report before entering the try block: if construction
                        // fails there is nothing to stop, and the finally block must not
                        // raise a NullReferenceException that hides the real error.
                        Query.PerformanceReport performanceReportCalculate = new Hubble.Core.Query.PerformanceReport("Calculate");

                        try
                        {
                            qOptimize.CalculateOptimize(this.UpDict, ref result);
                        }
                        finally
                        {
                            performanceReportCalculate.Stop();
                        }
                    }
                    else
                    {
                        CalculateWithPosition(this.UpDict, ref result, _WordIndexes);
                    }
                }
            }

            if (this._QueryParameter.Not)
            {
                // Flag the result as a NOT set and combine it with the upstream result.
                result.Not = true;

                if (UpDict != null)
                {
                    result = result.AndMergeForNot(result, UpDict);
                }
            }

            performanceReport.Stop();

            return(result);
        }
예제 #15
0
        /// <summary>
        /// Calculates the result set for every ORDER BY combination except the plain
        /// "order by score desc" case.
        /// </summary>
        /// <remarks>
        /// Only the first entry of <c>WordIndexes</c> is scanned. Each document read through
        /// <c>GetNext</c> is filtered by the untokenized conditions (when required) and by the
        /// deleted-document list, optionally sampled into the group-by collection, and then
        /// offered to a bounded priority queue ordered by the ORDER BY comparer. The queue
        /// content is finally copied into <paramref name="docIdRank"/>, which is marked as sorted.
        /// The shared payload lock is held for the whole calculation and always released.
        /// </remarks>
        /// <param name="upDict">Not referenced by this method.</param>
        /// <param name="docIdRank">
        /// Receives the selected documents, the number of documents that passed the filters
        /// (<c>RelTotalCount</c>) and, when group by is needed, the sampled group-by doc ids.
        /// </param>
        unsafe public void CalculateOptimizeNormalOrderBy(Core.SFQL.Parse.DocumentResultWhereDictionary upDict,
                                                          ref Core.SFQL.Parse.DocumentResultWhereDictionary docIdRank)
        {
            // Value substituted for a score whose computation overflowed into a negative number.
            const long overflowScore = long.MaxValue - 4000000;

            DBProvider dBProvider = Argument.DBProvider;

            Argument.DBProvider.SharedPayloadProvider.EnterPayloladShareLock();

            // The try block starts immediately after the lock is taken so that nothing
            // can throw between acquiring the lock and guaranteeing its release.
            try
            {
                bool           needFilterUntokenizedConditions = this.Argument.NeedFilterUntokenizedConditions;
                ExpressionTree untokenizedTree = this.Argument.UntokenizedTreeOnRoot;

                // Scratch document result used only to evaluate the untokenized conditions.
                Query.DocumentResult  documentResult;
                Query.DocumentResult *drp = &documentResult;

                bool orderByIncludingScore = Argument.OrderByIncludingScore();

                Field[] orderByFields;

                DocId2LongComparer comparer = DocId2LongComparer.Generate(
                    dBProvider, Argument.OrderBys, out orderByFields);

                bool needGroupBy = Argument.NeedGroupBy;

                WordIndexForQuery wifq = WordIndexes[0];
                _IndexReader = wifq.WordIndex.IndexReader;

                Data.Field rankField = Argument.DBProvider.GetField("Rank");

                if (rankField != null)
                {
                    // An untokenized int field named "Rank" switches on the rank-field
                    // state (flag, tab index and payload buffer) kept in instance fields.
                    if (rankField.DataType == Hubble.Core.Data.DataType.Int &&
                        rankField.IndexType == Hubble.Core.Data.Field.Index.Untokenized)
                    {
                        _HasRandField         = true;
                        _RankTab              = rankField.TabIndex;
                        _DocidPayloads        = new OriginalDocumentPositionList[2 * 1024];
                        _CurDocidPayloadIndex = _DocidPayloads.Length;
                    }
                }

                if (_IndexReader != null)
                {
                    int top;

                    //vars for delete
                    bool  haveRecordsDeleted = dBProvider.DelProvider.Count > 0;
                    int[] delDocs            = null;
                    int   curDelIndex        = 0;
                    int   curDelDocid        = 0;

                    //vars for group by sampling
                    int   groupByCount       = 0;
                    int   groupByLen         = dBProvider.Table.GroupByLimit;
                    int   groupByStep        = 1;
                    int   groupByIndex       = 0;

                    if (needGroupBy)
                    {
                        // Sample one document every groupByStep documents so that at most
                        // groupByLen doc ids are collected across the whole word index.
                        groupByStep = wifq.RelTotalCount / groupByLen;

                        if (groupByStep <= 0)
                        {
                            groupByStep = 1;
                        }
                    }

                    if (haveRecordsDeleted)
                    {
                        delDocs     = dBProvider.DelProvider.DelDocs;
                        curDelDocid = delDocs[curDelIndex];
                    }

                    //calculate top
                    //Round End up to the next multiple of 100; fall back to 100 if that overflows.
                    if (this.Argument.End >= 0)
                    {
                        top = (1 + this.Argument.End / 100) * 100;

                        if (top <= 0)
                        {
                            top = 100;
                        }
                    }
                    else
                    {
                        top = int.MaxValue;
                    }

                    PriorQueue <Docid2Long> priorQueue = new PriorQueue <Docid2Long>(top, comparer);
                    int rows = 0;

                    Entity.OriginalDocumentPositionList docList = new OriginalDocumentPositionList();

                    bool notEOF = GetNext(ref docList);

                    // Last element of the queue once it is full; candidates are compared to it.
                    Docid2Long last = new Docid2Long();
                    last.DocId = -1;

                    int relCount = 0;

                    while (notEOF)
                    {
                        //Process untokenized conditions.
                        //If is not matched, get the next one.
                        if (needFilterUntokenizedConditions)
                        {
                            int docId = docList.DocumentId;
                            drp->DocId       = docId;
                            drp->PayloadData = dBProvider.GetPayloadDataWithShareLock(docId);
                            if (!ParseWhere.GetComparisionExpressionValue(dBProvider, drp,
                                                                          untokenizedTree))
                            {
                                notEOF = GetNext(ref docList);
                                continue;
                            }
                        }

                        //Process deleted records
                        // NOTE(review): this walk assumes both delDocs and the doc ids returned
                        // by GetNext are in ascending order - confirm against the providers.
                        if (haveRecordsDeleted)
                        {
                            if (curDelIndex < delDocs.Length)
                            {
                                //If docid deleted, get next
                                if (docList.DocumentId == curDelDocid)
                                {
                                    notEOF = GetNext(ref docList);
                                    continue;
                                }
                                else if (docList.DocumentId > curDelDocid)
                                {
                                    // Advance the delete cursor until it reaches or passes the current doc id.
                                    while (curDelIndex < delDocs.Length && curDelDocid < docList.DocumentId)
                                    {
                                        curDelIndex++;

                                        if (curDelIndex >= delDocs.Length)
                                        {
                                            // Delete list exhausted: stop checking for the rest of the scan.
                                            haveRecordsDeleted = false;
                                            break;
                                        }

                                        curDelDocid = delDocs[curDelIndex];
                                    }

                                    if (curDelIndex < delDocs.Length)
                                    {
                                        if (docList.DocumentId == curDelDocid)
                                        {
                                            notEOF = GetNext(ref docList);
                                            continue;
                                        }
                                    }
                                }
                            }
                        }

                        if (needGroupBy)
                        {
                            if (groupByCount < groupByLen)
                            {
                                if (groupByIndex >= groupByStep)
                                {
                                    groupByIndex = 0;
                                }

                                if (groupByIndex == 0)
                                {
                                    docIdRank.AddToGroupByCollection(docList.DocumentId);
                                    groupByCount++;
                                }

                                groupByIndex++;
                            }
                        }

                        relCount++;

                        // The score only matters when the ORDER BY clause refers to it.
                        long score = 1;

                        if (orderByIncludingScore)
                        {
                            //one word, score = count
                            // NOTE(review): the division by 8 presumably strips a field packed
                            // into the low bits of CountAndWordCount - confirm.
                            int wordCount = docList.CountAndWordCount / 8;
                            score = (long)wifq.FieldRank * (long)wifq.WordRank * (long)wifq.Idf_t * (long)wordCount * (long)1000000 /
                                    ((long)wifq.Sum_d_t * (long)docList.TotalWordsInThisDocument);
                        }

                        // Clamp on overflow whether or not the queue is already full, so a
                        // wrapped (negative) score never takes part in the ordering.
                        if (score < 0)
                        {
                            score = overflowScore;
                        }

                        Docid2Long cur = new Docid2Long();
                        cur.DocId = docList.DocumentId;
                        cur.Rank  = docList.TotalWordsInThisDocument;

                        Docid2Long.Generate(ref cur, dBProvider, orderByFields, score);

                        if (rows >= top)
                        {
                            // Queue is full: only accept candidates that sort before the current last one.
                            if (comparer.Compare(last, cur) > 0)
                            {
                                priorQueue.Add(cur);
                                last = priorQueue.Last;
                            }
                        }
                        else
                        {
                            priorQueue.Add(cur);
                            rows++;

                            if (rows == top)
                            {
                                last = priorQueue.Last;
                            }
                        }

                        notEOF = GetNext(ref docList);
                    }

                    docIdRank.RelTotalCount = relCount;

                    foreach (Docid2Long docid2Long in priorQueue.ToArray())
                    {
                        long score = comparer.GetScore(docid2Long); //use Rank store TotalWordsInThisDocument

                        if (score < 0)
                        {
                            //Overflow
                            score = overflowScore;
                        }

                        docIdRank.Add(docid2Long.DocId, new DocumentResult(docid2Long.DocId, score));
                    }

                    docIdRank.Sorted = true;
                }
            }
            finally
            {
                Argument.DBProvider.SharedPayloadProvider.LeavePayloadShareLock();
            }
        }