コード例 #1
0
 /// <summary>
 /// Calls <c>FirstPosition()</c> on every PP of the cyclic list, from <c>min</c> through <c>max</c>.
 /// </summary>
 private void PlaceFirstPositions()
 {
     PhrasePositions visited = null;
     PhrasePositions current = min;

     // Walk the cyclic list; the walk ends right after max has been handled.
     while (visited != max)
     {
         current.FirstPosition();
         visited = current;
         current = current.next;
     }
 }
コード例 #2
0
 /// <summary>
 /// Creates a sloppy phrase scorer, wrapping each posting in a <see cref="PhrasePositions"/>
 /// and linking them into a cyclic list that runs from <c>min</c> to <c>max</c>.
 /// </summary>
 /// <param name="weight">Weight handed to the base constructor.</param>
 /// <param name="postings">Postings of the phrase. A <c>null</c> or empty array is tolerated:
 /// no phrase positions are created and the cost keeps its default value.</param>
 /// <param name="slop">Slop value stored for later use.</param>
 /// <param name="docScorer">Similarity scorer stored for later use.</param>
 internal SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, int slop, Similarity.SimScorer docScorer)
     : base(weight)
 {
     this.docScorer   = docScorer;
     this.slop        = slop;
     this.numPostings = postings == null ? 0 : postings.Length;
     // Size the queue from the null-safe count rather than dereferencing postings again.
     pq = new PhraseQueue(this.numPostings);
     if (this.numPostings > 0)
     {
         // min(cost)
         cost = postings[0].postings.GetCost();
         // convert tps to a list of phrase positions.
         // note: phrase-position differs from term-position in that its position
         // reflects the phrase offset: pp.pos = tp.pos - offset.
         // this allows to easily identify a matching (exact) phrase
         // when all PhrasePositions have exactly the same position.
         min     = new PhrasePositions(postings[0].postings, postings[0].position, 0, postings[0].terms);
         max     = min;
         max.doc = -1;
         for (int i = 1; i < postings.Length; i++)
         {
             PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
             max.next = pp;
             max      = pp;
             max.doc  = -1;
         }
         max.next = min; // make it cyclic for easier manipulation
     }
 }
コード例 #3
0
 /// <summary>
 /// Picks the lesser of two pps, ordering by position first and by offset on equal positions.
 /// When neither is strictly lesser, <paramref name="pp2"/> is returned.
 /// </summary>
 private PhrasePositions Lesser(PhrasePositions pp, PhrasePositions pp2)
 {
     bool firstIsLesser = pp.position != pp2.position
         ? pp.position < pp2.position
         : pp.offset < pp2.offset;

     return firstIsLesser ? pp : pp2;
 }
コード例 #4
0
ファイル: SloppyPhraseScorer.cs プロジェクト: wow64bb/YAFNET
 /// <summary>
 /// Returns whichever pp has the smaller position; equal positions are decided by the smaller offset.
 /// A complete tie yields <paramref name="pp2"/>.
 /// </summary>
 private static PhrasePositions Lesser(PhrasePositions pp, PhrasePositions pp2) // LUCENENET: CA1822: Mark members as static
 {
     if (pp.position != pp2.position)
     {
         // positions differ: position alone decides
         return pp.position < pp2.position ? pp : pp2;
     }
     // same position: fall back to the offset
     return pp.offset < pp2.offset ? pp : pp2;
 }
コード例 #5
0
        //  private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
        //    //if (min.doc != ?) return;
        //    ps.println();
        //    ps.println("---- "+title);
        //    ps.println("EXT: "+ext);
        //    PhrasePositions[] t = new PhrasePositions[pq.size()];
        //    if (pq.size()>0) {
        //      t[0] = pq.pop();
        //      ps.println("  " + 0 + "  " + t[0]);
        //      for (int i=1; i<t.length; i++) {
        //        t[i] = pq.pop();
        //        assert t[i-1].position <= t[i].position;
        //        ps.println("  " + i + "  " + t[i]);
        //      }
        //      // add them back
        //      for (int i=t.length-1; i>=0; i--) {
        //        pq.add(t[i]);
        //      }
        //    }
        //  }

        /// <summary>
        /// Skips <c>min</c> to <paramref name="target"/> and, on success, rotates both
        /// <c>min</c> and <c>max</c> one step along the cyclic list.
        /// </summary>
        /// <returns> <c>false</c> if the skip failed (in which case <c>max.doc</c> is set to
        /// <c>NO_MORE_DOCS</c>), <c>true</c> otherwise. </returns>
        private bool AdvanceMin(int target)
        {
            if (min.SkipTo(target))
            {
                // rotate the cyclic list by one
                min = min.next;
                max = max.next;
                return true;
            }

            max.doc = NO_MORE_DOCS; // for further calls to docID()
            return false;
        }
コード例 #6
0
 /// <summary>
 /// Empties the queue and re-adds every pp of the cyclic list (the pps are already placed),
 /// raising <c>end</c> to the largest position seen.
 /// </summary>
 private void FillQueue()
 {
     pq.Clear();

     PhrasePositions visited = null;
     PhrasePositions current = min;

     // Walk the cyclic list; the walk ends right after max has been handled.
     while (visited != max)
     {
         if (end < current.position)
         {
             end = current.position;
         }
         pq.Add(current);

         visited = current;
         current = current.next;
     }
 }
コード例 #7
0
 /// <summary>
 /// Moves <paramref name="pp"/> to its next position and raises <c>end</c> if the new position exceeds it.
 /// </summary>
 /// <returns> <c>false</c> if <paramref name="pp"/> has no next position, <c>true</c> otherwise. </returns>
 private bool AdvancePP(PhrasePositions pp)
 {
     bool advanced = pp.NextPosition();

     // 'end' is only touched when the advance succeeded
     if (advanced && end < pp.position)
     {
         end = pp.position;
     }
     return advanced;
 }
コード例 #8
0
 /// <summary>
 /// Initialization for the no-repeats case (simplest and most common): places every pp on its
 /// first position, raises <c>end</c> accordingly and rebuilds the queue from the list.
 /// Keep this piece of code simple and efficient.
 /// </summary>
 private void InitSimple()
 {
     pq.Clear();

     PhrasePositions visited = null;
     PhrasePositions current = min;

     // Walk the cyclic list; the walk ends right after max has been handled.
     while (visited != max)
     {
         current.FirstPosition();
         if (end < current.position)
         {
             end = current.position;
         }
         pq.Add(current);

         visited = current;
         current = current.next;
     }
 }
コード例 #9
0
        /// <summary>
        /// Looks in the repetition group of <paramref name="pp"/> for another pp whose
        /// <c>TpPos</c> equals that of <paramref name="pp"/>.
        /// </summary>
        /// <returns> The <c>rptInd</c> of the first such pp, or -1 if there is none. </returns>
        private int Collide(PhrasePositions pp)
        {
            int docPos = TpPos(pp);

            foreach (PhrasePositions other in rptGroups[pp.rptGroup])
            {
                if (other != pp && TpPos(other) == docPos)
                {
                    return other.rptInd;
                }
            }
            return -1;
        }
コード例 #10
0
        /// <summary>
        /// Score a candidate doc for all slop-valid position-combinations (matches)
        /// encountered while traversing/hopping the PhrasePositions.
        /// <para/> The score contribution of a match depends on the distance:
        /// <para/> - highest score for distance=0 (exact match).
        /// <para/> - score gets lower as distance gets higher.
        /// <para/>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
        /// once for "a b" (distance=0), and once for "b a" (distance=2).
        /// <para/>Possibly not all valid combinations are encountered, because for efficiency
        /// we always propagate the least PhrasePosition. This allows to base on
        /// <see cref="Util.PriorityQueue{T}"/> and move forward faster.
        /// As result, for example, document "a b c b a"
        /// would score differently for queries "a b c"~4 and "c b a"~4, although
        /// they really are equivalent.
        /// Similarly, for doc "a b c b a f g", query "c b"~2
        /// would get same score as "g f"~2, although "c b"~2 could be matched twice.
        /// We may want to fix this in the future (currently not, for performance reasons).
        /// </summary>
        /// <returns> The sum of the slop factors of all scored matches, or 0 if
        /// <c>InitPhrasePositions()</c> returns <c>false</c>. </returns>
        private float PhraseFreq()
        {
            // initialization reported failure: nothing to score for this doc
            if (!InitPhrasePositions())
            {
                return(0.0f);
            }
            float freq = 0.0f;

            numMatches = 0;
            // pp is the popped queue entry being advanced; 'next' is the position of the new queue top
            PhrasePositions pp          = pq.Pop();
            int             matchLength = end - pp.position;
            int             next        = pq.Top.position;

            while (AdvancePP(pp))
            {
                if (hasRpts && !AdvanceRpts(pp))
                {
                    break;              // pps exhausted
                }
                if (pp.position > next) // done minimizing current match-length
                {
                    // only matches within the slop contribute to the frequency
                    if (matchLength <= slop)
                    {
                        freq += docScorer.ComputeSlopFactor(matchLength); // score match
                        numMatches++;
                    }
                    // re-queue pp and continue with the new queue top
                    pq.Add(pp);
                    pp          = pq.Pop();
                    next        = pq.Top.position;
                    matchLength = end - pp.position;
                }
                else
                {
                    // pp has not passed 'next' yet: keep the smallest match length seen so far
                    int matchLength2 = end - pp.position;
                    if (matchLength2 < matchLength)
                    {
                        matchLength = matchLength2;
                    }
                }
            }
            // the loop ended with a candidate match still pending: score it if it is within the slop
            if (matchLength <= slop)
            {
                freq += docScorer.ComputeSlopFactor(matchLength); // score match
                numMatches++;
            }
            return(freq);
        }
コード例 #11
0
        /// <summary>
        /// pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
        /// of the two colliding pps. Note that there can only be one collision, as by the initialization
        /// there were no collisions before pp was advanced.
        /// </summary>
        /// <param name="pp"> The pp that was just advanced. </param>
        /// <returns> <c>false</c> if a pp was exhausted while resolving collisions, <c>true</c> otherwise
        /// (including when <paramref name="pp"/> is not a repeater). </returns>
        private bool AdvanceRpts(PhrasePositions pp)
        {
            if (pp.rptGroup < 0)
            {
                return(true); // not a repeater
            }
            PhrasePositions[] rg   = rptGroups[pp.rptGroup];
            FixedBitSet       bits = new FixedBitSet(rg.Length); // for re-queuing after collisions are resolved
            int k0 = pp.rptInd; // group index of the pp this call started with
            int k;

            while ((k = Collide(pp)) >= 0)
            {
                pp = Lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
                if (!AdvancePP(pp))
                {
                    return(false); // exhausted
                }
                if (k != k0)       // careful: mark only those currently in the queue
                {
                    bits = FixedBitSet.EnsureCapacity(bits, k);
                    bits.Set(k); // mark that pp2 need to be re-queued
                }
            }
            // collisions resolved, now re-queue
            // empty (partially) the queue until seeing all pps advanced for resolving collisions
            int n = 0;
            // TODO would be good if we can avoid calling cardinality() in each iteration!
            int numBits = bits.Length; // length of the bit set; guards the bits.Get() call below

            while (bits.Cardinality() > 0)
            {
                // park every popped pp on rptStack so it can be added back afterwards
                PhrasePositions pp2 = pq.Pop();
                rptStack[n++] = pp2;
                if (pp2.rptGroup >= 0 && pp2.rptInd < numBits && bits.Get(pp2.rptInd)) // this bit may not have been set
                {
                    bits.Clear(pp2.rptInd);
                }
            }
            // add back to queue
            for (int i = n - 1; i >= 0; i--)
            {
                pq.Add(rptStack[i]);
            }
            return(true);
        }
コード例 #12
0
 /// <summary>
 /// At initialization (each doc), each repetition group is sorted by (query) offset.
 /// This provides the start condition: no collisions.
 /// <para/>Case 1: no multi-term repeats
 /// <para/>
 /// It is sufficient to advance each pp in the group by one less than its group index.
 /// So lesser pp is not advanced, 2nd one advance once, 3rd one advanced twice, etc.
 /// <para/>Case 2: multi-term repeats
 /// <para/>
 /// Each pp of the group is checked with <c>Collide</c>, and collisions are resolved by
 /// advancing one of the two colliding pps until none remains.
 /// </summary>
 /// <returns> <c>false</c> if PPs are exhausted.  </returns>
 private bool AdvanceRepeatGroups()
 {
     foreach (PhrasePositions[] rg in rptGroups)
     {
         if (hasMultiTermRpts)
         {
             // more involved, some may not collide
             int incr; // loop step: 1 to move on to the next pp, 0 to re-check the same index
             for (int i = 0; i < rg.Length; i += incr)
             {
                 incr = 1;
                 PhrasePositions pp = rg[i];
                 int             k;
                 while ((k = Collide(pp)) >= 0)
                 {
                     // NOTE(review): the comment below says "higher offset", but the pp advanced is the one
                     // returned by Lesser(pp, rg[k]) - confirm which of the two is intended.
                     PhrasePositions pp2 = Lesser(pp, rg[k]);
                     if (!AdvancePP(pp2)) // at initialization always advance pp with higher offset
                     {
                         return(false);   // exhausted
                     }
                     if (pp2.rptInd < i)  // should not happen?
                     {
                         // an already-handled pp was moved: stay on index i and check it again
                         incr = 0;
                         break;
                     }
                 }
             }
         }
         else
         {
             // simpler, we know exactly how much to advance
             for (int j = 1; j < rg.Length; j++)
             {
                 // advance rg[j] exactly j times
                 for (int k = 0; k < j; k++)
                 {
                     if (!rg[j].NextPosition())
                     {
                         return(false); // PPs exhausted
                     }
                 }
             }
         }
     }
     return(true); // PPs available
 }
コード例 #13
0
        /// <summary>
        /// Collects, in list order, every pp that has at least one term contained in
        /// <paramref name="rptTerms"/>. Sets <c>hasMultiTermRpts</c> when such a pp has more than one term.
        /// </summary>
        /// <returns> The repeating pps as an array. </returns>
        private PhrasePositions[] RepeatingPPs(HashMap <Term, int?> rptTerms)
        {
            List <PhrasePositions> repeaters = new List <PhrasePositions>();
            PhrasePositions visited = null;
            PhrasePositions current = min;

            // Walk the cyclic list; the walk ends right after max has been handled.
            while (visited != max)
            {
                foreach (Term term in current.terms)
                {
                    if (!rptTerms.ContainsKey(term))
                    {
                        continue;
                    }
                    // first repeating term found: record the pp once and stop scanning its terms
                    repeaters.Add(current);
                    if (current.terms.Length > 1)
                    {
                        hasMultiTermRpts = true;
                    }
                    break;
                }
                visited = current;
                current = current.next;
            }
            return repeaters.ToArray();
        }
コード例 #14
0
ファイル: SloppyPhraseScorer.cs プロジェクト: wow64bb/YAFNET
        /// <summary>
        /// Counts term occurrences over all pps of the cyclic list. Each term seen a second time
        /// is given the next ordinal (the number of repeating terms found before it).
        /// </summary>
        /// <returns> The repeating terms mapped to their ordinals. </returns>
        private JCG.LinkedDictionary <Term, int?> RepeatingTerms()
        {
            var ordinals    = new JCG.LinkedDictionary <Term, int?>();
            var occurrences = new Dictionary <Term, int>();

            PhrasePositions visited = null;
            PhrasePositions current = min;

            // Walk the cyclic list; the walk ends right after max has been handled.
            while (visited != max)
            {
                foreach (Term term in current.terms)
                {
                    occurrences.TryGetValue(term, out int seen); // seen stays 0 for a new term
                    int count = seen + 1;
                    occurrences[term] = count;
                    if (count == 2)
                    {
                        // second sighting: the term repeats, assign it the next ordinal
                        ordinals[term] = ordinals.Count;
                    }
                }
                visited = current;
                current = current.next;
            }
            return ordinals;
        }
コード例 #15
0
 /// <summary>
 /// Actual position in the doc of a PhrasePosition: its phrase position plus its offset
 /// (relies on position = tpPos - offset).
 /// </summary>
 private int TpPos(PhrasePositions pp)
 {
     int phrasePosition = pp.position;
     int queryOffset    = pp.offset;
     return phrasePosition + queryOffset;
 }
コード例 #16
0
        /// <summary>
        /// Detect repetition groups. Done once - for first doc.
        /// <para/>Every pp placed in a group gets its <c>rptGroup</c> set to the index of that group
        /// in the returned list.
        /// </summary>
        /// <param name="rptTerms"> The repeating terms, as passed on to <c>RepeatingPPs</c>. </param>
        /// <returns> The list of repetition groups, each a list of pps. </returns>
        private IList <IList <PhrasePositions> > GatherRptGroups(LinkedHashMap <Term, int?> rptTerms)
        {
            PhrasePositions[] rpp = RepeatingPPs(rptTerms);
            IList <IList <PhrasePositions> > res = new List <IList <PhrasePositions> >();

            if (!hasMultiTermRpts)
            {
                // simpler - no multi-terms - can base on positions in first doc
                for (int i = 0; i < rpp.Length; i++)
                {
                    PhrasePositions pp = rpp[i];
                    if (pp.rptGroup >= 0) // already marked as a repetition
                    {
                        continue;
                    }
                    int tpPos = TpPos(pp);
                    for (int j = i + 1; j < rpp.Length; j++)
                    {
                        PhrasePositions pp2 = rpp[j];
                        // skip pp2 when, in clause order: it is already marked as a repetition;
                        // the two PPs are originally in the same offset in the query (not a repetition);
                        // or its actual doc position differs from that of pp (not a repetition)
                        if (pp2.rptGroup >= 0 || pp2.offset == pp.offset || TpPos(pp2) != tpPos)
                        {
                            continue;
                        }
                        // a repetition
                        int g = pp.rptGroup;
                        if (g < 0)
                        {
                            // first repetition found for pp: open a new group containing pp
                            g           = res.Count;
                            pp.rptGroup = g;
                            List <PhrasePositions> rl = new List <PhrasePositions>(2);
                            rl.Add(pp);
                            res.Add(rl);
                        }
                        pp2.rptGroup = g;
                        res[g].Add(pp2);
                    }
                }
            }
            else
            {
                // more involved - has multi-terms
                List <HashSet <PhrasePositions> > tmp = new List <HashSet <PhrasePositions> >();
                // NOTE(review): PpTermsBitSets, UnionTermGroups and TermGroups are defined outside this view;
                // presumably tg maps each repeating term to its group id - confirm
                IList <FixedBitSet> bb = PpTermsBitSets(rpp, rptTerms);
                UnionTermGroups(bb);
                IDictionary <Term, int> tg = TermGroups(rptTerms, bb);
                HashSet <int>           distinctGroupIDs = new HashSet <int>(tg.Values);
                // one (initially empty) set per distinct group id
                for (int i = 0; i < distinctGroupIDs.Count; i++)
                {
                    tmp.Add(new HashSet <PhrasePositions>());
                }
                foreach (PhrasePositions pp in rpp)
                {
                    foreach (Term t in pp.terms)
                    {
                        if (rptTerms.ContainsKey(t))
                        {
                            // put pp into the group of its repeating term
                            int g = tg[t];
                            tmp[g].Add(pp);
                            // all repeating terms of one pp are expected to share a group
                            Debug.Assert(pp.rptGroup == -1 || pp.rptGroup == g);
                            pp.rptGroup = g;
                        }
                    }
                }
                // convert each set into a list for the result
                foreach (HashSet <PhrasePositions> hs in tmp)
                {
                    res.Add(new List <PhrasePositions>(hs));
                }
            }
            return(res);
        }
コード例 #17
0
ファイル: SloppyPhraseScorer.cs プロジェクト: wow64bb/YAFNET
 /// <summary>
 /// Actual position in the doc of a PhrasePosition, i.e. <c>pp.position + pp.offset</c>.
 /// </summary>
 // LUCENENET: CA1822: Mark members as static
 private static int TpPos(PhrasePositions pp) => pp.position + pp.offset;