Exemple #1
0
 /// <summary>
 /// Moves <paramref name="pp"/> to its next position and, when that position lies beyond
 /// the current <c>end</c>, raises <c>end</c> to it.
 /// </summary>
 /// <param name="pp">The phrase position to advance.</param>
 /// <returns>
 /// <c>false</c> if <c>pp.NextPosition()</c> reports that pp is exhausted (in which case
 /// <c>end</c> is left untouched); <c>true</c> otherwise.
 /// </returns>
 private bool AdvancePP(PhrasePositions pp)
 {
     bool advanced = pp.NextPosition();

     // Only a successful advance may widen the window's upper bound.
     if (advanced && pp.position > end)
     {
         end = pp.position;
     }

     return advanced;
 }
        /// <summary> Score a candidate doc for all slop-valid position-combinations (matches)
        /// encountered while traversing/hopping the PhrasePositions.
        /// <br/> The score contribution of a match depends on the distance:
        /// <br/> - highest score for distance=0 (exact match).
        /// <br/> - score gets lower as distance gets higher.
        /// <br/>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
        /// once for "a b" (distance=0), and once for "b a" (distance=2).
        /// <br/>Possibly not all valid combinations are encountered, because for efficiency
        /// we always propagate the least PhrasePosition. This allows us to rely on the
        /// PriorityQueue and move forward faster.
        /// As a result, for example, document "a b c b a"
        /// would score differently for queries "a b c"~4 and "c b a"~4, although
        /// they really are equivalent.
        /// Similarly, for doc "a b c b a f g", query "c b"~2
        /// would get the same score as "g f"~2, although "c b"~2 could be matched twice.
        /// We may want to fix this in the future (currently not, for performance reasons).
        /// </summary>
        /// <returns> The sum of SloppyFreq(matchLength) over every examined window whose
        /// length (end - start) is at most slop; 0 when InitPhrasePositions() returns a
        /// negative value.
        /// </returns>
        protected internal override float PhraseFreq()
        {
            // end is the largest pp position in the current window; a negative value from
            // InitPhrasePositions() means there is nothing to score.
            int end = InitPhrasePositions();

            float freq = 0.0f;
            bool  done = (end < 0);

            while (!done)
            {
                // Take the entry popped from pq and peek at the one that is now on top.
                // NOTE(review): assumes pq still holds at least one entry after Pop(),
                // otherwise the cast result of Top() would be dereferenced as null -- confirm.
                PhrasePositions pp    = (PhrasePositions)pq.Pop();
                int             start = pp.position;
                int             next  = ((PhrasePositions)pq.Top()).position;

                // Keep advancing pp while it has not passed 'next', or while the last
                // advance left it in a state where tpsDiffer is false.
                bool tpsDiffer = true;
                for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position)
                {
                    // Only accept pos as the new window start when it is within range
                    // and the previous step reported differing term positions.
                    if (pos <= next && tpsDiffer)
                    {
                        start = pos;                         // advance pp to min window
                    }
                    if (!pp.NextPosition())
                    {
                        done = true;                         // ran out of a term -- done
                        break;
                    }
                    // The TermPositionsDiffer check is skipped for pp's not flagged as repeats.
                    // NOTE(review): TermPositionsDiffer is defined elsewhere; here it is
                    // treated as returning null when pp's term position is distinct, and
                    // otherwise a PhrasePositions involved in the clash -- confirm.
                    PhrasePositions pp2 = null;
                    tpsDiffer = !pp.repeats || (pp2 = TermPositionsDiffer(pp)) == null;
                    if (pp2 != null && pp2 != pp)
                    {
                        // NOTE(review): Flip is defined elsewhere; its result replaces pp
                        // for the remainder of this iteration -- confirm its effect on pq.
                        pp = Flip(pp, pp2);                         // flip pp to pp2
                    }
                }

                // The window is still scored (and pp re-queued) on the iteration that
                // sets done, because the break only leaves the inner for loop.
                int matchLength = end - start;
                if (matchLength <= slop)
                {
                    freq += GetSimilarity().SloppyFreq(matchLength);                     // score match
                }
                // pp may now sit beyond the previous maximum; extend the window end.
                if (pp.position > end)
                {
                    end = pp.position;
                }
                pq.Put(pp);                 // restore pq
            }

            return(freq);
        }
        /// <summary>
        /// Computes the sloppy phrase frequency for the current document by repeatedly
        /// advancing the PhrasePositions popped from the queue and scoring each window
        /// whose length fits within slop.
        /// </summary>
        /// <returns> The sum of SloppyFreq(matchLength) over every examined window whose
        /// length is at most slop.
        /// </returns>
        protected internal override float PhraseFreq()
        {
            // Reset each pp to its first position, remember the largest position seen,
            // and refill the queue from the linked list.
            pq.Clear();
            int maxPos = 0;
            PhrasePositions node = first;
            while (node != null)
            {
                node.FirstPosition();
                if (maxPos < node.position)
                {
                    maxPos = node.position;
                }
                pq.Put(node);
                node = node.next;
            }

            float total = 0.0f;
            bool exhausted = false;

            do
            {
                PhrasePositions least = (PhrasePositions)pq.Pop();
                int windowStart = least.position;
                int nextLeast = ((PhrasePositions)pq.Top()).position;

                // Walk the popped pp forward until it passes the queue's current top;
                // the last position not beyond it becomes the window start.
                int current = windowStart;
                while (current <= nextLeast)
                {
                    windowStart = current;
                    if (!least.NextPosition())
                    {
                        // Ran out of a term: finish this window, then stop.
                        exhausted = true;
                        break;
                    }
                    current = least.position;
                }

                int windowLength = maxPos - windowStart;
                if (windowLength <= slop)
                {
                    total += GetSimilarity().SloppyFreq(windowLength);
                }

                // The advanced pp may now define the new maximum position.
                if (maxPos < least.position)
                {
                    maxPos = least.position;
                }

                // Put the pp back so the queue is complete for the next round.
                pq.Put(least);
            }
            while (!exhausted);

            return total;
        }
Exemple #4
0
        /// <summary> Initializes the PhrasePositions in place for the current document and
        /// rebuilds the priority queue from the pp list.
        /// <br/>There is a one-time initialization for this scorer:
        /// <br/>- Put in repeats[] each pp for which another pp has the same (position + offset).
        /// <br/>- Also mark each such pp by pp.repeats = true.
        /// <br/>Once that check has run and found nothing, later calls take a short path that
        /// only positions the pp's and builds the queue, so queries with no repetitions
        /// pay no extra cost for this computation.
        /// <br/>- Example 1 - query with no repetitions: "ho my"~2
        /// <br/>- Example 2 - query with repetitions: "ho my my"~2
        /// <br/>- Example 3 - query with repetitions: "my ho my"~2
        /// <br/>When repeats exist, each call also sorts repeats[] and advances the repeating
        /// pp's until TermPositionsDiffer(pp) holds for each, to avoid false phrase detection.
        /// </summary>
        /// <returns> end (max position), or -1 if any term ran out (i.e. done)
        /// </returns>
        private int InitPhrasePositions()
        {
            int maxPos = 0;
            PhrasePositions cursor;

            // Short path: repeats were already checked and none were recorded, so just
            // position every pp and load it into the queue.
            if (checkedRepeats && repeats == null)
            {
                pq.Clear();
                cursor = first;
                while (cursor != null)
                {
                    cursor.FirstPosition();
                    if (maxPos < cursor.position)
                    {
                        maxPos = cursor.position;
                    }
                    pq.Put(cursor);
                    cursor = cursor.next;
                }
                return maxPos;
            }

            // Move every pp to its first position before any comparison takes place.
            cursor = first;
            while (cursor != null)
            {
                cursor.FirstPosition();
                cursor = cursor.next;
            }

            // One-time detection of repeating pp's for this scorer.
            if (!checkedRepeats)
            {
                checkedRepeats = true;

                // Used as a set: only the keys matter. Created lazily on the first hit.
                System.Collections.Hashtable seen = null;
                for (PhrasePositions outer = first; outer != null; outer = outer.next)
                {
                    int outerTermPos = outer.position + outer.offset;
                    for (PhrasePositions inner = outer.next; inner != null; inner = inner.next)
                    {
                        int innerTermPos = inner.position + inner.offset;
                        if (innerTermPos != outerTermPos)
                        {
                            continue;
                        }

                        if (seen == null)
                        {
                            seen = new System.Collections.Hashtable();
                        }
                        outer.repeats = true;
                        inner.repeats = true;
                        seen[outer]   = null;
                        seen[inner]   = null;
                    }
                }

                if (seen != null)
                {
                    System.Collections.ArrayList keys = new System.Collections.ArrayList(seen.Keys);
                    repeats = (PhrasePositions[])keys.ToArray(typeof(PhrasePositions));
                }
            }

            // With repeats, some repeating pp's must be advanced so that they all start
            // with differing term positions.
            if (repeats != null)
            {
                // Ordering is delegated to AnonymousClassComparator (defined elsewhere);
                // the intent is to propagate higher offsets first, otherwise matches
                // might be missed.
                System.Array.Sort(repeats, new AnonymousClassComparator(this));

                for (int idx = 0; idx < repeats.Length; idx++)
                {
                    PhrasePositions repeating = repeats[idx];
                    while (!TermPositionsDiffer(repeating))
                    {
                        if (!repeating.NextPosition())
                        {
                            // Ran out of a term -- nothing left to match in this doc.
                            return -1;
                        }
                    }
                }
            }

            // Load the (possibly advanced) pp's into the queue and compute the max position.
            pq.Clear();
            cursor = first;
            while (cursor != null)
            {
                if (maxPos < cursor.position)
                {
                    maxPos = cursor.position;
                }
                pq.Put(cursor);
                cursor = cursor.next;
            }

            return maxPos;
        }