internal bool SkipTo(int target)
{
    Doc = Postings.Advance(target);
    if (Doc == DocIdSetIterator.NO_MORE_DOCS)
    {
        return false;
    }
    return true;
}
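For context, here is a minimal sketch (not taken from the snippet above) of how a DocsAndPositionsEnum is typically consumed after an advance; the method name WalkPositionsFrom and its parameter names are illustrative assumptions only.

// Hedged sketch: walk all docs >= target and read every position.
// Assumes `postings` was obtained from a TermsEnum for some term; not part of the original code.
private static void WalkPositionsFrom(DocsAndPositionsEnum postings, int target)
{
    int doc = postings.Advance(target);             // jump to the first doc whose id is >= target
    while (doc != DocIdSetIterator.NO_MORE_DOCS)    // NO_MORE_DOCS is the exhaustion sentinel
    {
        int freq = postings.Freq;                   // number of positions recorded for this doc
        for (int i = 0; i < freq; i++)
        {
            int position = postings.NextPosition(); // must be called exactly Freq times per doc
        }
        doc = postings.NextDoc();                   // continue sequentially after the initial skip
    }
}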
public override sealed int Advance(int target)
{
    while (_queue.Top != null && target > _queue.Top.DocID)
    {
        DocsAndPositionsEnum postings = _queue.Pop();
        if (postings.Advance(target) != NO_MORE_DOCS)
        {
            _queue.Add(postings);
        }
    }
    return NextDoc();
}
public virtual void TestDocsAndPositionsEnum()
{
    TermsEnum termsEnum = reader.GetTerms(DOC_POSITIONS_FIELD).GetIterator(null);
    assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(DOC_POSITIONS_TERM)));
    DocsAndPositionsEnum sortedPositions = termsEnum.DocsAndPositions(null, null);
    int doc;

    // test nextDoc()
    while ((doc = sortedPositions.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
    {
        int freq = sortedPositions.Freq;
        assertEquals("incorrect freq for doc=" + doc, sortedValues[doc] / 10 + 1, freq);
        for (int i = 0; i < freq; i++)
        {
            assertEquals("incorrect position for doc=" + doc, i, sortedPositions.NextPosition());
            if (!DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
            {
                assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.StartOffset);
                assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.EndOffset);
            }
            assertEquals("incorrect payload for doc=" + doc, freq - i, int.Parse(sortedPositions.GetPayload().Utf8ToString(), CultureInfo.InvariantCulture));
        }
    }

    // test advance()
    DocsAndPositionsEnum reuse = sortedPositions;
    sortedPositions = termsEnum.DocsAndPositions(null, reuse);
    if (sortedPositions is SortingAtomicReader.SortingDocsAndPositionsEnum)
    {
        assertTrue(((SortingAtomicReader.SortingDocsAndPositionsEnum)sortedPositions).Reused(reuse)); // make sure reuse worked
    }
    doc = 0;
    while ((doc = sortedPositions.Advance(doc + TestUtil.NextInt32(Random, 1, 5))) != DocIdSetIterator.NO_MORE_DOCS)
    {
        int freq = sortedPositions.Freq;
        assertEquals("incorrect freq for doc=" + doc, sortedValues[doc] / 10 + 1, freq);
        for (int i = 0; i < freq; i++)
        {
            assertEquals("incorrect position for doc=" + doc, i, sortedPositions.NextPosition());
            if (!DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
            {
                assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.StartOffset);
                assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.EndOffset);
            }
            assertEquals("incorrect payload for doc=" + doc, freq - i, int.Parse(sortedPositions.GetPayload().Utf8ToString(), CultureInfo.InvariantCulture));
        }
    }
}
/// <summary>
/// Read the parents of the new categories
/// </summary>
private void InitParents(IndexReader reader, int first)
{
    if (reader.MaxDoc == first)
    {
        return;
    }

    // it's ok to use MultiFields because we only iterate on one posting list.
    // breaking it to loop over the leaves() only complicates code for no
    // apparent gain.
    DocsAndPositionsEnum positions = MultiFields.GetTermPositionsEnum(reader, null,
        Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
        DocsAndPositionsFlags.PAYLOADS);

    // shouldn't really happen, if it does, something's wrong
    if (positions == null || positions.Advance(first) == DocIdSetIterator.NO_MORE_DOCS)
    {
        throw new CorruptIndexException("Missing parent data for category " + first);
    }

    int num = reader.MaxDoc;
    for (int i = first; i < num; i++)
    {
        if (positions.DocID == i)
        {
            if (positions.Freq == 0) // shouldn't happen
            {
                throw new CorruptIndexException("Missing parent data for category " + i);
            }

            parents[i] = positions.NextPosition();

            if (positions.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
            {
                if (i + 1 < num)
                {
                    throw new CorruptIndexException("Missing parent data for category " + (i + 1));
                }
                break;
            }
        }
        else // this shouldn't happen
        {
            throw new CorruptIndexException("Missing parent data for category " + i);
        }
    }
}
/// <summary>
/// checks advancing docs + positions
/// </summary>
public virtual void AssertPositionsSkipping(int docFreq, DocsAndPositionsEnum leftDocs, DocsAndPositionsEnum rightDocs)
{
    if (leftDocs == null || rightDocs == null)
    {
        Assert.IsNull(leftDocs);
        Assert.IsNull(rightDocs);
        return;
    }

    int docid = -1;
    int averageGap = MAXDOC / (1 + docFreq);
    int skipInterval = 16;

    while (true)
    {
        if (Random.NextBoolean())
        {
            // nextDoc()
            docid = leftDocs.NextDoc();
            Assert.AreEqual(docid, rightDocs.NextDoc());
        }
        else
        {
            // advance()
            int skip = docid + (int)Math.Ceiling(Math.Abs(skipInterval + Random.NextDouble() * averageGap));
            docid = leftDocs.Advance(skip);
            Assert.AreEqual(docid, rightDocs.Advance(skip));
        }

        if (docid == DocIdSetIterator.NO_MORE_DOCS)
        {
            return;
        }

        int freq = leftDocs.Freq;
        Assert.AreEqual(freq, rightDocs.Freq);
        for (int i = 0; i < freq; i++)
        {
            Assert.AreEqual(leftDocs.NextPosition(), rightDocs.NextPosition());
            // we don't compare the payloads; it's allowed that one is empty etc.
        }
    }
}
public override int FetchDoc(int targetDoc)
{
    if (targetDoc <= m_curDoc)
    {
        targetDoc = m_curDoc + 1;
    }

    if ((m_curDoc = m_dp.Advance(targetDoc)) != DocsEnum.NO_MORE_DOCS)
    {
        m_posLeft = m_dp.Freq;
        m_curSec = -1;
        m_curPos = -1;
        return m_curDoc;
    }
    else
    {
        m_curDoc = DocIdSetIterator.NO_MORE_DOCS;
        return m_curDoc;
    }
}
public static ISet<int> HighlightedOffsets(int pagenumber, string pattern, DirectoryInfo dataFolder)
{
    using (Analyzer analyzer = new SimpleAnalyzer(LuceneVersion.LUCENE_48))
    using (IndexReader reader = new PagesReader(dataFolder))
    {
        Query query = new QueryParser(LuceneVersion.LUCENE_48, string.Empty, analyzer).Parse(pattern);
        TermsEnum termsEnum = reader.GetTermVector(pagenumber, string.Empty).GetIterator(TermsEnum.EMPTY);
        IEnumerable<int> highlightOffsets = ExtractTerms(query).SelectMany(term =>
        {
            termsEnum.SeekExact(term.Bytes);
            DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
            docsAndPositionsEnum.Advance(pagenumber);
            return Enumerable.Range(0, docsAndPositionsEnum.Freq).Select(i =>
            {
                docsAndPositionsEnum.NextPosition();
                return docsAndPositionsEnum.StartOffset;
            });
        });
        return new HashSet<int>(highlightOffsets);
    }
}
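A hedged usage sketch for the HighlightedOffsets helper above; the page number, query pattern, and index folder below are placeholder values, not taken from the original code.

// Illustrative call only: values are hypothetical, PagesReader and its index layout come from the snippet above.
ISet<int> offsets = HighlightedOffsets(
    pagenumber: 3,
    pattern: "lucene AND highlight",
    dataFolder: new DirectoryInfo(@"C:\data\pages-index"));
foreach (int startOffset in offsets)
{
    Console.WriteLine(startOffset); // character offset of a matched term on that page
}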
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc,
    TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
{
    PassageScorer scorer = GetScorer(field);
    if (scorer is null)
    {
        // LUCENENET: Changed from NullPointerException to InvalidOperationException (which isn't caught anywhere outside of tests)
        throw IllegalStateException.Create("PassageScorer cannot be null");
    }

    JCG.PriorityQueue<OffsetsEnum> pq = new JCG.PriorityQueue<OffsetsEnum>();
    float[] weights = new float[terms.Length];

    // initialize postings
    for (int i = 0; i < terms.Length; i++)
    {
        DocsAndPositionsEnum de = postings[i];
        int pDoc;
        if (de == EMPTY)
        {
            continue;
        }
        else if (de is null)
        {
            postings[i] = EMPTY; // initially
            if (!termsEnum.SeekExact(terms[i]))
            {
                continue; // term not found
            }
            de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
            if (de is null)
            {
                // no positions available
                throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
            }
            pDoc = de.Advance(doc);
        }
        else
        {
            pDoc = de.DocID;
            if (pDoc < doc)
            {
                pDoc = de.Advance(doc);
            }
        }

        if (doc == pDoc)
        {
            weights[i] = scorer.Weight(contentLength, de.Freq);
            de.NextPosition();
            pq.Add(new OffsetsEnum(de, i));
        }
    }

    pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination

    JCG.PriorityQueue<Passage> passageQueue = new JCG.PriorityQueue<Passage>(n, Comparer<Passage>.Create((left, right) =>
    {
        if (left.score < right.score)
        {
            return -1;
        }
        else if (left.score > right.score)
        {
            return 1;
        }
        else
        {
            return left.startOffset - right.startOffset;
        }
    }));
    Passage current = new Passage();

    while (pq.TryDequeue(out OffsetsEnum off))
    {
        DocsAndPositionsEnum dp = off.dp;
        int start = dp.StartOffset;
        if (start == -1)
        {
            throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
        }
        int end = dp.EndOffset;

        // LUCENE-5166: this hit would span the content limit... however more valid
        // hits may exist (they are sorted by start). so we pretend like we never
        // saw this term, it won't cause a passage to be added to passageQueue or anything.
        if (Debugging.AssertsEnabled) { Debugging.Assert(EMPTY.StartOffset == int.MaxValue); }
        if (start < contentLength && end > contentLength)
        {
            continue;
        }

        if (start >= current.endOffset)
        {
            if (current.startOffset >= 0)
            {
                // finalize current
                current.score *= scorer.Norm(current.startOffset);
                // new sentence: first add 'current' to queue
                if (passageQueue.Count == n && current.score < passageQueue.Peek().score)
                {
                    current.Reset(); // can't compete, just reset it
                }
                else
                {
                    passageQueue.Enqueue(current);
                    if (passageQueue.Count > n)
                    {
                        current = passageQueue.Dequeue();
                        current.Reset();
                    }
                    else
                    {
                        current = new Passage();
                    }
                }
            }

            // if we exceed limit, we are done
            if (start >= contentLength)
            {
                Passage[] passages = passageQueue.ToArray();
                foreach (Passage p in passages)
                {
                    p.Sort();
                }
                // sort in ascending order
                ArrayUtil.TimSort(passages, Comparer<Passage>.Create((left, right) => left.startOffset - right.startOffset));
                return passages;
            }

            // advance breakiterator
            if (Debugging.AssertsEnabled) { Debugging.Assert(BreakIterator.Done < 0); }
            current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
            current.endOffset = Math.Min(bi.Next(), contentLength);
        }

        int tf = 0;
        while (true)
        {
            tf++;
            BytesRef term = terms[off.id];
            if (term is null)
            {
                // multitermquery match, pull from payload
                term = off.dp.GetPayload();
                if (Debugging.AssertsEnabled) { Debugging.Assert(term != null); }
            }
            current.AddMatch(start, end, term);
            if (off.pos == dp.Freq)
            {
                break; // removed from pq
            }
            else
            {
                off.pos++;
                dp.NextPosition();
                start = dp.StartOffset;
                end = dp.EndOffset;
            }
            if (start >= current.endOffset || end > contentLength)
            {
                pq.Enqueue(off);
                break;
            }
        }
        current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset);
    }

    // Dead code but compiler disagrees:
    if (Debugging.AssertsEnabled) { Debugging.Assert(false); }
    return null;
}
private IDictionary<int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms,
    int[] docids, IList<AtomicReaderContext> leaves, int maxPassages, Query query)
{
    IDictionary<int, object> highlights = new Dictionary<int, object>();

    PassageFormatter fieldFormatter = GetFormatter(field);
    if (fieldFormatter is null)
    {
        // LUCENENET: Changed from NullPointerException to InvalidOperationException (which isn't caught anywhere outside of tests)
        throw IllegalStateException.Create("PassageFormatter cannot be null");
    }

    // check if we should do any multiterm processing
    Analyzer analyzer = GetIndexAnalyzer(field);
    CharacterRunAutomaton[] automata = Arrays.Empty<CharacterRunAutomaton>();
    if (analyzer != null)
    {
        automata = MultiTermHighlighting.ExtractAutomata(query, field);
    }

    // resize 'terms', where the last term is the multiterm matcher
    if (automata.Length > 0)
    {
        BytesRef[] newTerms = new BytesRef[terms.Length + 1];
        System.Array.Copy(terms, 0, newTerms, 0, terms.Length);
        terms = newTerms;
    }

    // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
    // otherwise, we will just advance() existing enums to the new document in the same segment.
    DocsAndPositionsEnum[] postings = null;
    TermsEnum termsEnum = null;
    int lastLeaf = -1;

    for (int i = 0; i < docids.Length; i++)
    {
        string content = contents[i];
        if (content.Length == 0)
        {
            continue; // nothing to do
        }
        bi.SetText(content);
        int doc = docids[i];
        int leaf = ReaderUtil.SubIndex(doc, leaves);
        AtomicReaderContext subContext = leaves[leaf];
        AtomicReader r = subContext.AtomicReader;
        if (Debugging.AssertsEnabled) { Debugging.Assert(leaf >= lastLeaf); } // increasing order

        // if the segment has changed, we must initialize new enums.
        if (leaf != lastLeaf)
        {
            Terms t = r.GetTerms(field);
            if (t != null)
            {
                termsEnum = t.GetEnumerator();
                postings = new DocsAndPositionsEnum[terms.Length];
            }
        }
        if (termsEnum is null)
        {
            continue; // no terms for this field, nothing to do
        }

        // if there are multi-term matches, we have to initialize the "fake" enum for each document
        if (automata.Length > 0)
        {
            DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata);
            dp.Advance(doc - subContext.DocBase);
            postings[terms.Length - 1] = dp; // last term is the multiterm matcher
        }

        Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages);
        if (passages.Length == 0)
        {
            // no passages were returned, so ask for a default summary
            passages = GetEmptyHighlight(field, bi, maxPassages);
        }
        if (passages.Length > 0)
        {
            highlights[doc] = fieldFormatter.Format(passages, content);
        }
        lastLeaf = leaf;
    }

    return highlights;
}