/// <summary>
/// Positions every PhrasePositions at its first occurrence in the current doc.
/// Walks the cyclic list exactly once, stopping after <c>max</c> has been handled.
/// </summary>
private void PlaceFirstPositions()
{
    PhrasePositions prev = null;
    PhrasePositions pp = min;
    while (prev != max) // one full lap around the cyclic list
    {
        pp.FirstPosition();
        prev = pp;
        pp = pp.next;
    }
}
// Phrase frequency in the current doc, as computed by PhraseFreq().
private float freq;

/// <summary>
/// Builds the scorer state: a linked list of PhrasePositions (one per term)
/// plus an empty priority queue sized to hold them all.
/// </summary>
internal PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, byte[] norms) : base(similarity)
{
    this.norms = norms;
    this.weight = weight;
    this.value_Renamed = weight.Value;
    // convert tps to a list of phrase positions.
    // note: phrase-position differs from term-position in that its position
    // reflects the phrase offset: pp.pos = tp.pos - offset.
    // this allows to easily identify a matching (exact) phrase
    // when all PhrasePositions have exactly the same position.
    for (int i = 0; i < tps.Length; i++)
    {
        PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]);
        if (last != null)
        {
            // add next to end of list
            last.next = pp;
        }
        else
        {
            first = pp;
        }
        last = pp;
    }
    pq = new PhraseQueue(tps.Length); // construct empty pq
    // -1 appears to be a "no doc consumed yet" sentinel — confirm against DoNext()/Next()
    first.doc = -1;
}
/// <summary>
/// Builds the scorer state: a cyclic linked list of PhrasePositions (one per
/// posting) plus an empty priority queue sized to hold them all.
/// Fix: the original read <c>postings[0]</c> (for cost) and
/// <c>postings.Length</c> (for the pq) before the emptiness/null guard, so a
/// null or empty array threw despite the null-tolerant numPostings computation.
/// All such accesses are now inside the guard; behavior for valid (non-empty)
/// input is unchanged.
/// </summary>
internal SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, int slop, Similarity.SimScorer docScorer) : base(weight)
{
    this.docScorer = docScorer;
    this.slop = slop;
    this.numPostings = postings == null ? 0 : postings.Length;
    pq = new PhraseQueue(numPostings); // construct empty pq, sized for all pps
    // convert tps to a list of phrase positions.
    // note: phrase-position differs from term-position in that its position
    // reflects the phrase offset: pp.pos = tp.pos - offset.
    // this allows to easily identify a matching (exact) phrase
    // when all PhrasePositions have exactly the same position.
    if (numPostings > 0)
    {
        cost = postings[0].postings.GetCost(); // min(cost): postings are sorted, cheapest first
        min = new PhrasePositions(postings[0].postings, postings[0].position, 0, postings[0].terms);
        max = min;
        max.doc = -1; // sentinel: no doc consumed yet
        for (int i = 1; i < postings.Length; i++)
        {
            PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
            max.next = pp;
            max = pp;
            max.doc = -1;
        }
        max.next = min; // make it cyclic for easier manipulation
    }
}
// Phrase frequency in the current doc, as computed by PhraseFreq().
private float freq;

/// <summary>
/// Builds a linked list of PhrasePositions (one per term) and an empty
/// priority queue sized to hold them all. A phrase-position is shifted by the
/// phrase offset (pp.pos = tp.pos - offset), so an exact match is simply all
/// PhrasePositions reporting the same position.
/// </summary>
internal PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, byte[] norms) : base(similarity)
{
    this.norms = norms;
    this.weight = weight;
    this.value_Renamed = weight.Value;

    for (int i = 0; i < tps.Length; i++)
    {
        PhrasePositions node = new PhrasePositions(tps[i], offsets[i]);
        if (first == null)
        {
            first = node; // empty list: node becomes the head
        }
        else
        {
            last.next = node; // append at the tail
        }
        last = node;
    }

    pq = new PhraseQueue(tps.Length); // construct empty pq
    first.doc = -1;
}
/// <summary>
/// Counts how many times the exact phrase occurs in the current document,
/// i.e. how many times all PhrasePositions report the same position.
/// </summary>
protected internal override float PhraseFreq()
{
    // Defensive clear, consistent with later versions of this method: ensure
    // no stale entries survive in the queue before rebuilding it.
    pq.Clear();
    // sort list with pq
    for (PhrasePositions pp = first; pp != null; pp = pp.next)
    {
        pp.FirstPosition();
        pq.Put(pp); // build pq from list
    }
    PqToList(); // rebuild list from pq (sorted by position)

    int freq = 0;
    do
    {
        // find position w/ all terms
        while (first.position < last.position)
        {
            // scan forward in first until it catches up with last
            do
            {
                if (!first.NextPosition())
                {
                    return (float)freq; // least-advanced term exhausted: done
                }
            }
            while (first.position < last.position);
            FirstToLast(); // caught up: rotate head to the tail
        }
        freq++; // all equal: a match
    }
    while (last.NextPosition());
    return (float)freq;
}
/// <summary>
/// Rotates the linked list: the head node is detached and re-appended at the tail.
/// Statement order is preserved from the original (it is aliasing-sensitive).
/// </summary>
protected internal void FirstToLast()
{
    PhrasePositions moved = first;
    last.next = moved;   // splice old head behind the current tail
    last = moved;        // it becomes the new tail
    first = moved.next;  // head advances to the following node
    last.next = null;    // new tail terminates the list
}
/// <summary>
/// Exact-phrase frequency in the current document.
/// </summary>
protected internal override float PhraseFreq()
{
    // sort list with pq
    pq.Clear();
    for (PhrasePositions pp = first; pp != null; pp = pp.next)
    {
        pp.FirstPosition();
        pq.Put(pp); // build pq from list
    }
    PqToList(); // rebuild list from pq
    // for counting how many times the exact phrase is found in current document,
    // just count how many times all PhrasePosition's have exactly the same position.
    int freq = 0;
    do
    {
        // find position w/ all terms
        while (first.position < last.position)
        {
            // scan forward in first until it catches up with last
            do
            {
                if (!first.NextPosition())
                {
                    // least-advanced term exhausted: no more matches possible
                    return (freq);
                }
            }
            while (first.position < last.position);
            FirstToLast(); // caught up: rotate the head to the tail
        }
        freq++; // all equal: a match
    }
    while (last.NextPosition());
    return (freq);
}
/// <summary>
/// Returns whichever of the two pps orders first, comparing by position and
/// breaking ties by offset. (LUCENENET: CA1822: Mark members as static)
/// </summary>
private static PhrasePositions Lesser(PhrasePositions pp, PhrasePositions pp2) // LUCENENET: CA1822: Mark members as static
{
    bool firstWins = pp.position < pp2.position
        || (pp.position == pp2.position && pp.offset < pp2.offset);
    return firstWins ? pp : pp2;
}
/// <summary>
/// Compare two pps, but only by position and offset.
/// Returns pp when it orders strictly first; pp2 on ties or otherwise.
/// </summary>
private PhrasePositions Lesser(PhrasePositions pp, PhrasePositions pp2)
{
    if (pp.position > pp2.position)
    {
        return pp2;
    }
    if (pp.position == pp2.position && pp.offset >= pp2.offset)
    {
        return pp2; // tie on position: lower offset wins, pp2 on offset tie
    }
    return pp;
}
/// <summary>
/// Re-sorts the linked list by pushing every node through the priority queue
/// and rebuilding the list from it.
/// </summary>
private void Sort()
{
    pq.Clear();
    PhrasePositions node = first;
    while (node != null)
    {
        pq.Add(node);
        node = node.next;
    }
    PqToList();
}
/// <summary>
/// Advances every pp to its first document, stopping early once any term is
/// exhausted; sorts the list only if all terms still have documents.
/// </summary>
private void Init()
{
    PhrasePositions node = first;
    while (more && node != null)
    {
        more = node.Next();
        node = node.next;
    }
    if (more)
    {
        Sort();
    }
}
// private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
// //if (min.doc != ?) return;
// ps.println();
// ps.println("---- "+title);
// ps.println("EXT: "+ext);
// PhrasePositions[] t = new PhrasePositions[pq.size()];
// if (pq.size()>0) {
// t[0] = pq.pop();
// ps.println(" " + 0 + " " + t[0]);
// for (int i=1; i<t.length; i++) {
// t[i] = pq.pop();
// assert t[i-1].position <= t[i].position;
// ps.println(" " + i + " " + t[i]);
// }
// // add them back
// for (int i=t.length-1; i>=0; i--) {
// pq.add(t[i]);
// }
// }
// }

/// <summary>
/// Skips the least-advanced pp to <paramref name="target"/>; returns false
/// (marking the scorer exhausted) if it has no more docs.
/// </summary>
private bool AdvanceMin(int target)
{
    bool advanced = min.SkipTo(target);
    if (advanced)
    {
        // Rotate the cyclic list one step; the advanced pp becomes the new max.
        min = min.next;
        max = max.next;
    }
    else
    {
        max.doc = NO_MORE_DOCS; // for further calls to docID()
    }
    return advanced;
}
// private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
// //if (min.doc != ?) return;
// ps.println();
// ps.println("---- "+title);
// ps.println("EXT: "+ext);
// PhrasePositions[] t = new PhrasePositions[pq.size()];
// if (pq.size()>0) {
// t[0] = pq.pop();
// ps.println(" " + 0 + " " + t[0]);
// for (int i=1; i<t.length; i++) {
// t[i] = pq.pop();
// assert t[i-1].position <= t[i].position;
// ps.println(" " + i + " " + t[i]);
// }
// // add them back
// for (int i=t.length-1; i>=0; i--) {
// pq.add(t[i]);
// }
// }
// }

/// <summary>
/// Skips the least-advanced pp to <paramref name="target"/>; returns false
/// (marking the scorer exhausted) if it has no more docs.
/// </summary>
private bool AdvanceMin(int target)
{
    bool advanced = Min.SkipTo(target);
    if (advanced)
    {
        // Rotate the cyclic list one step; the advanced pp becomes the new Max.
        Min = Min.next;
        Max = Max.next;
    }
    else
    {
        Max.Doc = NO_MORE_DOCS; // for further calls to docID()
    }
    return advanced;
}
/// <summary>
/// Fill the queue (all pps are already placed); also advances End to the
/// rightmost position seen.
/// </summary>
private void FillQueue()
{
    Pq.Clear();
    PhrasePositions prev = null;
    PhrasePositions pp = Min;
    while (prev != Max) // cyclic list: one full lap, ending after Max
    {
        if (End < pp.Position)
        {
            End = pp.Position;
        }
        Pq.Add(pp);
        prev = pp;
        pp = pp.next;
    }
}
/// <summary>
/// Advance a PhrasePositions and update 'end'; returns false if exhausted.
/// </summary>
private bool AdvancePP(PhrasePositions pp)
{
    bool hasNext = pp.NextPosition();
    if (hasNext && pp.position > end)
    {
        end = pp.position; // track the rightmost position seen
    }
    return hasNext;
}
/// <summary>
/// Fill the queue (all pps are already placed); also advances 'end' to the
/// rightmost position seen.
/// </summary>
private void FillQueue()
{
    pq.Clear();
    PhrasePositions prev = null;
    PhrasePositions pp = min;
    while (prev != max) // cyclic list: one full lap, ending after max
    {
        if (end < pp.position)
        {
            end = pp.position;
        }
        pq.Add(pp);
        prev = pp;
        pp = pp.next;
    }
}
/// <summary>
/// Skips every term's positions to the target doc (bailing out once any term
/// is exhausted), re-sorts, and searches for the next match from there.
/// </summary>
public override bool SkipTo(int target)
{
    PhrasePositions node = first;
    while (node != null && more)
    {
        more = node.SkipTo(target);
        node = node.next;
    }
    if (more)
    {
        Sort(); // re-sort
    }
    return DoNext();
}
/// <summary>
/// Queue ordering: primarily by document id, then by position within the
/// same document.
/// </summary>
public override bool LessThan(System.Object o1, System.Object o2)
{
    PhrasePositions left = (PhrasePositions)o1;
    PhrasePositions right = (PhrasePositions)o2;
    if (left.doc != right.doc)
    {
        return left.doc < right.doc;
    }
    return left.position < right.position;
}
/// <summary>
/// No repeats: simplest case, and most common. It is important to keep this
/// piece of the code simple and efficient.
/// </summary>
private void InitSimple()
{
    pq.Clear();
    // Position each pp at its first occurrence and rebuild the queue,
    // tracking the rightmost position seen in 'end'.
    PhrasePositions prev = null;
    PhrasePositions pp = min;
    while (prev != max) // cyclic list: one full lap, ending after max
    {
        pp.FirstPosition();
        if (end < pp.position)
        {
            end = pp.position;
        }
        pq.Add(pp);
        prev = pp;
        pp = pp.next;
    }
}
/// <summary>
/// Index of a pp2 colliding with pp, or -1 if none.
/// </summary>
private int Collide(PhrasePositions pp)
{
    int tpPos = TpPos(pp);
    // Scan pp's repeat group for another member at the same term position.
    foreach (PhrasePositions other in rptGroups[pp.rptGroup])
    {
        if (other != pp && TpPos(other) == tpPos)
        {
            return other.rptInd;
        }
    }
    return -1;
}
/// <summary>
/// Sloppy phrase frequency in the current document: each slop-valid window
/// of term positions contributes GetSimilarity().SloppyFreq(distance).
/// </summary>
protected internal override float PhraseFreq()
{
    // Reset the queue, position each pp at its first occurrence, and track
    // the rightmost position in 'end'.
    pq.Clear();
    int end = 0;
    for (PhrasePositions pp = first; pp != null; pp = pp.next)
    {
        pp.FirstPosition();
        if (pp.position > end)
        {
            end = pp.position;
        }
        pq.Put(pp); // build pq from list
    }
    float freq = 0.0f;
    bool done = false;
    do
    {
        // Always propagate the least-advanced pp (top of the queue).
        PhrasePositions pp = (PhrasePositions)pq.Pop();
        int start = pp.position;
        int next = ((PhrasePositions)pq.Top()).position;
        for (int pos = start; pos <= next; pos = pp.position)
        {
            start = pos; // advance pp to min window
            if (!pp.NextPosition())
            {
                done = true; // ran out of a term -- done
                break;
            }
        }
        // Window is [start, end]; score it when its width is within the slop.
        int matchLength = end - start;
        if (matchLength <= slop)
        {
            freq += GetSimilarity().SloppyFreq(matchLength); // score match
        }
        if (pp.position > end)
        {
            end = pp.position;
        }
        pq.Put(pp); // restore pq
    }
    while (!done);
    return (freq);
}
/// <summary> Score a candidate doc for all slop-valid position-combinations (matches)
/// encountered while traversing/hopping the PhrasePositions.
/// <br/> The score contribution of a match depends on the distance:
/// <br/> - highest score for distance=0 (exact match).
/// <br/> - score gets lower as distance gets higher.
/// <br/>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
/// once for "a b" (distance=0), and once for "b a" (distance=2).
/// <br/>Possibly not all valid combinations are encountered, because for efficiency
/// we always propagate the least PhrasePosition. This allows to base on
/// PriorityQueue and move forward faster.
/// As result, for example, document "a b c b a"
/// would score differently for queries "a b c"~4 and "c b a"~4, although
/// they really are equivalent.
/// Similarly, for doc "a b c b a f g", query "c b"~2
/// would get same score as "g f"~2, although "c b"~2 could be matched twice.
/// We may want to fix this in the future (currently not, for performance reasons).
/// </summary>
protected internal override float PhraseFreq()
{
    // end < 0 signals that initialization already exhausted some term.
    int end = InitPhrasePositions();
    float freq = 0.0f;
    bool done = (end < 0);
    while (!done)
    {
        // Always propagate the least-advanced pp (top of the queue).
        PhrasePositions pp = (PhrasePositions)pq.Pop();
        int start = pp.position;
        int next = ((PhrasePositions)pq.Top()).position;
        bool tpsDiffer = true;
        // Keep advancing while pp trails the next pp, or while it collides
        // with a repeated term occupying the same term position.
        for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position)
        {
            if (pos <= next && tpsDiffer)
            {
                start = pos; // advance pp to min window
            }
            if (!pp.NextPosition())
            {
                done = true; // ran out of a term -- done
                break;
            }
            PhrasePositions pp2 = null;
            tpsDiffer = !pp.repeats || (pp2 = TermPositionsDiffer(pp)) == null;
            if (pp2 != null && pp2 != pp)
            {
                pp = Flip(pp, pp2); // flip pp to pp2: continue advancing the colliding one
            }
        }
        // Window is [start, end]; score it when its width is within the slop.
        int matchLength = end - start;
        if (matchLength <= slop)
        {
            freq += GetSimilarity().SloppyFreq(matchLength); // score match
        }
        if (pp.position > end)
        {
            end = pp.position;
        }
        pq.Put(pp); // restore pq
    }
    return (freq);
}
/// <summary>
/// Score a candidate doc for all slop-valid position-combinations (matches)
/// encountered while traversing/hopping the PhrasePositions.
/// <para/> The score contribution of a match depends on the distance:
/// <para/> - highest score for distance=0 (exact match).
/// <para/> - score gets lower as distance gets higher.
/// <para/>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
/// once for "a b" (distance=0), and once for "b a" (distance=2).
/// <para/>Possibly not all valid combinations are encountered, because for efficiency
/// we always propagate the least PhrasePosition. This allows to base on
/// <see cref="Util.PriorityQueue{T}"/> and move forward faster.
/// As result, for example, document "a b c b a"
/// would score differently for queries "a b c"~4 and "c b a"~4, although
/// they really are equivalent.
/// Similarly, for doc "a b c b a f g", query "c b"~2
/// would get same score as "g f"~2, although "c b"~2 could be matched twice.
/// We may want to fix this in the future (currently not, for performance reasons).
/// </summary>
private float PhraseFreq()
{
    if (!InitPhrasePositions())
    {
        return (0.0f); // a term was exhausted during initialization: no matches
    }
    float freq = 0.0f;
    numMatches = 0;
    // pp is the least-advanced PhrasePositions; the current candidate window
    // is [pp.position, end].
    PhrasePositions pp = pq.Pop();
    int matchLength = end - pp.position;
    int next = pq.Top.position;
    while (AdvancePP(pp))
    {
        if (hasRpts && !AdvanceRpts(pp))
        {
            break; // pps exhausted
        }
        if (pp.position > next) // done minimizing current match-length
        {
            if (matchLength <= slop)
            {
                freq += docScorer.ComputeSlopFactor(matchLength); // score match
                numMatches++;
            }
            // Re-queue pp and pick the new least-advanced pp as the next window start.
            pq.Add(pp);
            pp = pq.Pop();
            next = pq.Top.position;
            matchLength = end - pp.position;
        }
        else
        {
            // Still minimizing: keep the tightest window seen so far.
            int matchLength2 = end - pp.position;
            if (matchLength2 < matchLength)
            {
                matchLength = matchLength2;
            }
        }
    }
    // Score the final (possibly partial) window.
    if (matchLength <= slop)
    {
        freq += docScorer.ComputeSlopFactor(matchLength); // score match
        numMatches++;
    }
    return (freq);
}
/// <summary>
/// Advances all pps to the target doc (stopping once any term is exhausted),
/// re-sorts, and returns the doc of the next match, or NO_MORE_DOCS.
/// </summary>
public override int Advance(int target)
{
    firstTime = false;
    PhrasePositions node = first;
    while (node != null && more)
    {
        more = node.SkipTo(target);
        node = node.next;
    }
    if (more)
    {
        Sort(); // re-sort
    }
    if (!DoNext())
    {
        first.doc = NO_MORE_DOCS;
    }
    return first.doc;
}
/// <summary>
/// pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
/// of the two colliding pps. Note that there can only be one collision, as by the initialization
/// there were no collisions before pp was advanced.
/// </summary>
private bool AdvanceRpts(PhrasePositions pp)
{
    if (pp.rptGroup < 0)
    {
        return (true); // not a repeater
    }
    PhrasePositions[] rg = rptGroups[pp.rptGroup];
    FixedBitSet bits = new FixedBitSet(rg.Length); // for re-queuing after collisions are resolved
    int k0 = pp.rptInd;
    int k;
    while ((k = Collide(pp)) >= 0)
    {
        pp = Lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
        if (!AdvancePP(pp))
        {
            return (false); // exhausted
        }
        if (k != k0) // careful: mark only those currently in the queue
        {
            bits = FixedBitSet.EnsureCapacity(bits, k);
            bits.Set(k); // mark that pp2 need to be re-queued
        }
    }
    // collisions resolved, now re-queue
    // empty (partially) the queue until seeing all pps advanced for resolving collisions
    int n = 0;
    // TODO would be good if we can avoid calling cardinality() in each iteration!
    int numBits = bits.Length; // largest bit we set
    while (bits.Cardinality > 0)
    {
        PhrasePositions pp2 = pq.Pop();
        rptStack[n++] = pp2; // remember pop order so the queue can be restored
        if (pp2.rptGroup >= 0 && pp2.rptInd < numBits && bits.Get(pp2.rptInd)) // this bit may not have been set
        {
            bits.Clear(pp2.rptInd);
        }
    }
    // add back to queue
    for (int i = n - 1; i >= 0; i--)
    {
        pq.Add(rptStack[i]);
    }
    return (true);
}
/// <summary>
/// At initialization (each doc), each repetition group is sorted by (query) offset.
/// this provides the start condition: no collisions.
/// <para/>Case 1: no multi-term repeats
/// <para/>
/// It is sufficient to advance each pp in the group by one less than its group index.
/// So lesser pp is not advanced, 2nd one advance once, 3rd one advanced twice, etc.
/// <para/>Case 2: multi-term repeats
/// </summary>
/// <returns> <c>false</c> if PPs are exhausted. </returns>
private bool AdvanceRepeatGroups()
{
    foreach (PhrasePositions[] rg in rptGroups)
    {
        if (hasMultiTermRpts)
        {
            // more involved, some may not collide
            int incr;
            for (int i = 0; i < rg.Length; i += incr)
            {
                incr = 1;
                PhrasePositions pp = rg[i];
                int k;
                // Keep resolving collisions until pp occupies a free term position.
                while ((k = Collide(pp)) >= 0)
                {
                    PhrasePositions pp2 = Lesser(pp, rg[k]);
                    if (!AdvancePP(pp2)) // at initialization always advance pp with higher offset
                    {
                        return (false); // exhausted
                    }
                    if (pp2.rptInd < i) // should not happen?
                    {
                        // An earlier group member moved: re-process the current index.
                        incr = 0;
                        break;
                    }
                }
            }
        }
        else
        {
            // simpler, we know exactly how much to advance:
            // member j advances j times past the group's least member.
            for (int j = 1; j < rg.Length; j++)
            {
                for (int k = 0; k < j; k++)
                {
                    if (!rg[j].NextPosition())
                    {
                        return (false); // PPs exhausted
                    }
                }
            }
        }
    }
    return (true); // PPs available
}
/// <summary>
/// Find repeating pps, and for each, if has multi-terms, update this.hasMultiTermRpts.
/// </summary>
private PhrasePositions[] RepeatingPPs(IDictionary<Term, int?> rptTerms)
{
    IList<PhrasePositions> repeating = new JCG.List<PhrasePositions>();
    PhrasePositions prev = null;
    PhrasePositions pp = min;
    while (prev != max) // cyclic list: one full lap, ending after max
    {
        foreach (Term t in pp.terms)
        {
            if (rptTerms.ContainsKey(t))
            {
                repeating.Add(pp);
                hasMultiTermRpts |= (pp.terms.Length > 1);
                break; // one repeating term is enough to classify this pp
            }
        }
        prev = pp;
        pp = pp.next;
    }
    return repeating.ToArray();
}
// flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
// assumes: pp!=pp2, pp2 in pq, pp not in pq.
// called only when there are repeating pps.
private PhrasePositions Flip(PhrasePositions pp, PhrasePositions pp2)
{
    int count = 0;
    // pop entries into tmpPos until pp2 surfaces
    while (true)
    {
        PhrasePositions popped = (PhrasePositions)pq.Pop();
        if (popped == pp2)
        {
            break;
        }
        tmpPos[count++] = popped;
    }
    // push everything back except pp2 (reverse of pop order)
    while (count > 0)
    {
        pq.Insert(tmpPos[--count]);
    }
    // and put pp in pp2's place
    pq.Put(pp);
    return (pp2);
}
/// <summary>
/// Drains the priority queue, rebuilding the linked list in pop (sorted) order.
/// </summary>
protected internal void PqToList()
{
    first = null;
    last = null;
    while (pq.Top() != null)
    {
        PhrasePositions node = pq.Pop();
        if (first == null)
        {
            first = node; // first popped element becomes the head
        }
        else
        {
            last.next = node; // append at the tail
        }
        last = node;
        node.next = null;
    }
}
/// <summary>
/// Find repeating terms and assign them ordinal values.
/// </summary>
private JCG.LinkedDictionary<Term, int?> RepeatingTerms()
{
    JCG.LinkedDictionary<Term, int?> ordinals = new JCG.LinkedDictionary<Term, int?>();
    Dictionary<Term, int?> counts = new Dictionary<Term, int?>();
    PhrasePositions prev = null;
    PhrasePositions pp = min;
    while (prev != max) // cyclic list: one full lap, ending after max
    {
        foreach (Term t in pp.terms)
        {
            counts.TryGetValue(t, out int? seen);
            int? updated = seen == null ? new int?(1) : new int?(1 + (int)seen);
            counts[t] = updated;
            if (updated == 2)
            {
                // second sighting: t repeats; give it the next ordinal
                ordinals[t] = ordinals.Count;
            }
        }
        prev = pp;
        pp = pp.next;
    }
    return ordinals;
}
/// <summary>
/// Builds a linked list of PhrasePositions (one per term) and an empty
/// priority queue sized to hold them all.
/// </summary>
internal PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, byte[] norms) : base(similarity)
{
    this.norms = norms;
    this.weight = weight;
    this.value_Renamed = weight.GetValue();

    // convert tps to a linked list, preserving term order
    for (int i = 0; i < tps.Length; i++)
    {
        PhrasePositions node = new PhrasePositions(tps[i], positions[i]);
        if (first == null)
        {
            first = node; // empty list: node becomes the head
        }
        else
        {
            last.next = node; // append at the tail
        }
        last = node;
    }

    pq = new PhraseQueue(tps.Length); // construct empty pq
}
/// <summary> We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences
/// in the query of the same word would go elsewhere in the matched doc.
/// </summary>
/// <returns> null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
/// out of the first two PPs found to not differ.
/// </returns>
private PhrasePositions TermPositionsDiffer(PhrasePositions pp)
{
    // efficiency note: a more efficient implementation could keep a map between repeating
    // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
    // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
    // However this would complicate code, for a rather rare case, so choice is to compromise here.
    int tpPos = pp.position + pp.offset;
    foreach (PhrasePositions candidate in repeats)
    {
        if (candidate == pp)
        {
            continue;
        }
        if (candidate.position + candidate.offset == tpPos)
        {
            // do not differ: return the one with higher offset
            return pp.offset > candidate.offset ? pp : candidate;
        }
    }
    return null;
}
/// <summary> We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences
/// in the query of the same word would go elsewhere in the matched doc.
/// </summary>
/// <returns> null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
/// out of the first two PPs found to not differ.
/// </returns>
private PhrasePositions TermPositionsDiffer(PhrasePositions pp)
{
    // efficiency note: a more efficient implementation could keep a map between repeating
    // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
    // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
    // However this would complicate code, for a rather rare case, so choice is to compromise here.
    int tpPos = pp.position + pp.offset;
    for (int i = 0; i < repeats.Length; i++)
    {
        PhrasePositions other = repeats[i];
        if (other == pp)
        {
            continue;
        }
        if (tpPos == other.position + other.offset)
        {
            // do not differ: return the one with higher offset
            return pp.offset > other.offset ? pp : other;
        }
    }
    return null;
}
/// <summary>
/// Drains the priority queue, rebuilding the linked list in pop (sorted) order.
/// </summary>
protected internal void PqToList()
{
    first = null;
    last = null;
    while (pq.Top() != null)
    {
        PhrasePositions node = (PhrasePositions)pq.Pop();
        if (first == null)
        {
            first = node; // first popped element becomes the head
        }
        else
        {
            last.next = node; // append at the tail
        }
        last = node;
        node.next = null;
    }
}
// flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
// assumes: pp!=pp2, pp2 in pq, pp not in pq.
// called only when there are repeating pps.
private PhrasePositions Flip(PhrasePositions pp, PhrasePositions pp2)
{
    int count = 0;
    // pop entries into tmpPos until pp2 surfaces
    while (true)
    {
        PhrasePositions popped = (PhrasePositions)pq.Pop();
        if (popped == pp2)
        {
            break;
        }
        tmpPos[count++] = popped;
    }
    // push everything back except pp2 (reverse of pop order)
    while (count > 0)
    {
        pq.Insert(tmpPos[--count]);
    }
    // and put pp in pp2's place
    pq.Put(pp);
    return pp2;
}