public override bool Collect(BytesRef bytes)
{
    float boost = boostAtt.Boost;

    // make sure within a single seg we always collect
    // terms in order
    Debug.Assert(CompareToLastTerm(bytes));

    //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
    // ignore uncompetitive hits
    if (stQueue.Count == maxSize)
    {
        ScoreTerm t = stQueue.Peek();
        if (boost < t.Boost)
        {
            return true;
        }
        if (boost == t.Boost && termComp.Compare(bytes, t.Bytes) > 0)
        {
            return true;
        }
    }

    TermState state = termsEnum.GetTermState();
    Debug.Assert(state != null);

    if (visitedTerms.TryGetValue(bytes, out ScoreTerm t2))
    {
        // if the term is already in the PQ, only update docFreq of term in PQ
        Debug.Assert(t2.Boost == boost, "boost should be equal in all segment TermsEnums");
        t2.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
    }
    else
    {
        // add new entry in PQ, we must clone the term, else it may get overwritten!
        st.Bytes.CopyBytes(bytes);
        st.Boost = boost;
        visitedTerms[st.Bytes] = st;
        Debug.Assert(st.TermState.DocFreq == 0);
        st.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
        stQueue.Add(st);

        // possibly drop entries from queue
        if (stQueue.Count > maxSize)
        {
            st = stQueue.Dequeue();
            visitedTerms.Remove(st.Bytes);
            st.TermState.Clear(); // reset the termstate!
        }
        else
        {
            st = new ScoreTerm(termComp, new TermContext(m_topReaderContext));
        }
        Debug.Assert(stQueue.Count <= maxSize, "the PQ size must be limited to maxSize");

        // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
        if (stQueue.Count == maxSize)
        {
            t2 = stQueue.Peek();
            maxBoostAtt.MaxNonCompetitiveBoost = t2.Boost;
            maxBoostAtt.CompetitiveTerm = t2.Bytes;
        }
    }

    return true;
}
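The "ignore uncompetitive hits" check above is the heart of the top-N collection pattern: once the queue is full, a candidate is rejected by peeking at the weakest retained entry before any term state is registered. Below is a minimal sketch of that pattern, assuming .NET 6's System.Collections.Generic.PriorityQueue as a stand-in for the ScoreTerm min-queue; the terms and boosts are made-up sample data, not anything from Lucene.NET.

using System;
using System.Collections.Generic;

// Sketch of the bounded top-N pattern: the min-heap's Peek() is always the
// weakest retained entry, so anything that cannot beat it is rejected early.
internal static class TopNCollectorSketch
{
    public static void Main()
    {
        const int maxSize = 3;
        var queue = new PriorityQueue<string, float>(); // min-heap: lowest boost on top

        foreach (var (term, boost) in new[] { ("apple", 0.9f), ("pear", 0.4f), ("plum", 0.7f), ("fig", 0.3f), ("kiwi", 0.8f) })
        {
            // reject early: the queue is full and its weakest entry beats this boost
            if (queue.Count == maxSize && queue.TryPeek(out _, out float weakest) && boost <= weakest)
            {
                continue; // uncompetitive, skip without touching the queue
            }
            queue.Enqueue(term, boost);
            if (queue.Count > maxSize)
            {
                queue.Dequeue(); // drop the weakest to keep the queue bounded
            }
        }

        // drains in ascending boost order: plum, kiwi, apple
        while (queue.TryDequeue(out string term, out float boost))
        {
            Console.WriteLine($"{term}: {boost}");
        }
    }
}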
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
{
    PassageScorer scorer = GetScorer(field);
    if (scorer is null)
    {
        // LUCENENET: Changed from NullPointerException to InvalidOperationException (which isn't caught anywhere outside of tests)
        throw IllegalStateException.Create("PassageScorer cannot be null");
    }
    JCG.PriorityQueue<OffsetsEnum> pq = new JCG.PriorityQueue<OffsetsEnum>();
    float[] weights = new float[terms.Length];
    // initialize postings
    for (int i = 0; i < terms.Length; i++)
    {
        DocsAndPositionsEnum de = postings[i];
        int pDoc;
        if (de == EMPTY)
        {
            continue;
        }
        else if (de is null)
        {
            postings[i] = EMPTY; // initially
            if (!termsEnum.SeekExact(terms[i]))
            {
                continue; // term not found
            }
            de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
            if (de is null)
            {
                // no positions available
                throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
            }
            pDoc = de.Advance(doc);
        }
        else
        {
            pDoc = de.DocID;
            if (pDoc < doc)
            {
                pDoc = de.Advance(doc);
            }
        }

        if (doc == pDoc)
        {
            weights[i] = scorer.Weight(contentLength, de.Freq);
            de.NextPosition();
            pq.Add(new OffsetsEnum(de, i));
        }
    }

    pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination

    JCG.PriorityQueue<Passage> passageQueue = new JCG.PriorityQueue<Passage>(n, Comparer<Passage>.Create((left, right) =>
    {
        if (left.score < right.score)
        {
            return -1;
        }
        else if (left.score > right.score)
        {
            return 1;
        }
        else
        {
            return left.startOffset - right.startOffset;
        }
    }));
    Passage current = new Passage();

    while (pq.TryDequeue(out OffsetsEnum off))
    {
        DocsAndPositionsEnum dp = off.dp;
        int start = dp.StartOffset;
        if (start == -1)
        {
            throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
        }
        int end = dp.EndOffset;
        // LUCENE-5166: this hit would span the content limit... however more valid
        // hits may exist (they are sorted by start). so we pretend like we never
        // saw this term, it won't cause a passage to be added to passageQueue or anything.
        if (Debugging.AssertsEnabled) Debugging.Assert(EMPTY.StartOffset == int.MaxValue);
        if (start < contentLength && end > contentLength)
        {
            continue;
        }

        if (start >= current.endOffset)
        {
            if (current.startOffset >= 0)
            {
                // finalize current
                current.score *= scorer.Norm(current.startOffset);
                // new sentence: first add 'current' to queue
                if (passageQueue.Count == n && current.score < passageQueue.Peek().score)
                {
                    current.Reset(); // can't compete, just reset it
                }
                else
                {
                    passageQueue.Enqueue(current);
                    if (passageQueue.Count > n)
                    {
                        current = passageQueue.Dequeue();
                        current.Reset();
                    }
                    else
                    {
                        current = new Passage();
                    }
                }
            }
            // if we exceed limit, we are done
            if (start >= contentLength)
            {
                Passage[] passages = passageQueue.ToArray();
                foreach (Passage p in passages)
                {
                    p.Sort();
                }
                // sort in ascending order
                ArrayUtil.TimSort(passages, Comparer<Passage>.Create((left, right) => left.startOffset - right.startOffset));
                return passages;
            }
            // advance breakiterator
            if (Debugging.AssertsEnabled) Debugging.Assert(BreakIterator.Done < 0);
            current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
            current.endOffset = Math.Min(bi.Next(), contentLength);
        }
        int tf = 0;
        while (true)
        {
            tf++;
            BytesRef term = terms[off.id];
            if (term is null)
            {
                // multitermquery match, pull from payload
                term = off.dp.GetPayload();
                if (Debugging.AssertsEnabled) Debugging.Assert(term != null);
            }
            current.AddMatch(start, end, term);
            if (off.pos == dp.Freq)
            {
                break; // removed from pq
            }
            else
            {
                off.pos++;
                dp.NextPosition();
                start = dp.StartOffset;
                end = dp.EndOffset;
            }
            if (start >= current.endOffset || end > contentLength)
            {
                pq.Enqueue(off);
                break;
            }
        }
        current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset);
    }

    // Dead code but compiler disagrees:
    if (Debugging.AssertsEnabled) Debugging.Assert(false);
    return null;
}
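The header comment on HighlightDoc describes scoring each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)). The sketch below illustrates that shape on plain strings. It assumes a naive '.'-based sentence splitter instead of a BreakIterator, and the Norm/Tf functions and per-term weights are simplified, hypothetical stand-ins rather than PassageScorer's actual formulas.

using System;
using System.Collections.Generic;

// Sketch: score each sentence as norm(startOffset) * sum(weight * tf(freq)).
internal static class PassageScoringSketch
{
    // stand-in norm: earlier sentences score slightly higher
    private static float Norm(int startOffset) => 1f + 1f / (1f + startOffset);

    // stand-in tf: sublinear term-frequency saturation
    private static float Tf(int freq) => (float)Math.Sqrt(freq);

    public static void Main()
    {
        string content = "Lucene is a search library. Lucene indexes text. It is fast.";
        var weights = new Dictionary<string, float> { ["lucene"] = 2.0f, ["search"] = 1.0f };

        int offset = 0;
        foreach (string sentence in content.Split('.', StringSplitOptions.RemoveEmptyEntries))
        {
            // count per-sentence frequencies of the query terms
            var freqs = new Dictionary<string, int>();
            foreach (string word in sentence.Split(' ', StringSplitOptions.RemoveEmptyEntries))
            {
                string token = word.ToLowerInvariant();
                if (weights.ContainsKey(token))
                {
                    freqs[token] = freqs.TryGetValue(token, out int f) ? f + 1 : 1;
                }
            }

            // sum(weight * tf(freq)), then scale by the position-based norm
            float score = 0f;
            foreach (var (term, freq) in freqs)
            {
                score += weights[term] * Tf(freq);
            }
            score *= Norm(offset);

            Console.WriteLine($"[{offset}] \"{sentence.Trim()}\" -> {score:F3}");
            offset += sentence.Length + 1; // account for the consumed '.'
        }
    }
}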
public override bool Collect(BytesRef bytes)
{
    float boost = boostAtt.Boost;

    // make sure within a single seg we always collect
    // terms in order
    if (Debugging.AssertsEnabled) Debugging.Assert(CompareToLastTerm(bytes));

    //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
    // ignore uncompetitive hits
    if (stQueue.Count == maxSize)
    {
        ScoreTerm t = stQueue.Peek();
        // LUCENENET specific - compare bits rather than using equality operators to prevent these comparisons from failing in x86 in .NET Framework with optimizations enabled
        if (NumericUtils.SingleToSortableInt32(boost) < NumericUtils.SingleToSortableInt32(t.Boost))
        {
            return true;
        }
        // LUCENENET specific - compare bits rather than using equality operators to prevent these comparisons from failing in x86 in .NET Framework with optimizations enabled
        if (NumericUtils.SingleToSortableInt32(boost) == NumericUtils.SingleToSortableInt32(t.Boost) && termComp.Compare(bytes, t.Bytes) > 0)
        {
            return true;
        }
    }

    TermState state = termsEnum.GetTermState();
    if (Debugging.AssertsEnabled) Debugging.Assert(state != null);

    if (visitedTerms.TryGetValue(bytes, out ScoreTerm t2))
    {
        // if the term is already in the PQ, only update docFreq of term in PQ
        // LUCENENET specific - compare bits rather than using equality operators to prevent these comparisons from failing in x86 in .NET Framework with optimizations enabled
        if (Debugging.AssertsEnabled) Debugging.Assert(NumericUtils.SingleToSortableInt32(t2.Boost) == NumericUtils.SingleToSortableInt32(boost), "boost should be equal in all segment TermsEnums");
        t2.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
    }
    else
    {
        // add new entry in PQ, we must clone the term, else it may get overwritten!
        st.Bytes.CopyBytes(bytes);
        st.Boost = boost;
        visitedTerms[st.Bytes] = st;
        if (Debugging.AssertsEnabled) Debugging.Assert(st.TermState.DocFreq == 0);
        st.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
        stQueue.Add(st);

        // possibly drop entries from queue
        if (stQueue.Count > maxSize)
        {
            st = stQueue.Dequeue();
            visitedTerms.Remove(st.Bytes);
            st.TermState.Clear(); // reset the termstate!
        }
        else
        {
            st = new ScoreTerm(termComp, new TermContext(m_topReaderContext));
        }
        if (Debugging.AssertsEnabled) Debugging.Assert(stQueue.Count <= maxSize, "the PQ size must be limited to maxSize");

        // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
        if (stQueue.Count == maxSize)
        {
            t2 = stQueue.Peek();
            maxBoostAtt.MaxNonCompetitiveBoost = t2.Boost;
            maxBoostAtt.CompetitiveTerm = t2.Bytes;
        }
    }

    return true;
}
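The LUCENENET-specific comments above route boost comparisons through NumericUtils.SingleToSortableInt32 so they happen on 32-bit integer bit patterns rather than on floats the x86 .NET Framework JIT may hold in 80-bit x87 registers. Below is an illustrative re-implementation of that bit trick; Lucene.NET ships the real method in Lucene.Net.Util.NumericUtils, and BitConverter.SingleToInt32Bits is assumed available (.NET Core 2.0+ / .NET 5+).

using System;

// Sketch: map a float's IEEE 754 bits to an int whose signed ordering matches
// the float ordering, so comparisons run in integer registers and cannot be
// disturbed by extended-precision rounding.
internal static class SortableFloatBits
{
    public static int SingleToSortableInt32(float value)
    {
        int bits = BitConverter.SingleToInt32Bits(value);
        // positive floats already sort correctly as ints; negative floats sort
        // in reverse, so flip their magnitude bits to restore the order
        return bits >= 0 ? bits : bits ^ 0x7FFFFFFF;
    }

    public static void Main()
    {
        // the mapping preserves float ordering, including across the sign boundary
        float[] values = { float.NegativeInfinity, -2.5f, -0.0f, 0.0f, 1.5f, float.MaxValue };
        for (int i = 1; i < values.Length; i++)
        {
            bool ordered = SingleToSortableInt32(values[i - 1]) <= SingleToSortableInt32(values[i]);
            Console.WriteLine($"{values[i - 1]} <= {values[i]}: {ordered}"); // True for every pair
        }
    }
}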