//public static void main( string[] args ) throws Exception { // Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT); // QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer ); // Query query = parser.parse( "a x:b" ); // FieldQuery fieldQuery = new FieldQuery( query, true, false ); // Directory dir = new RAMDirectory(); // IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)); // Document doc = new Document(); // FieldType ft = new FieldType(TextField.TYPE_STORED); // ft.setStoreTermVectors(true); // ft.setStoreTermVectorOffsets(true); // ft.setStoreTermVectorPositions(true); // doc.add( new Field( "f", "a a a b b c a b b c d e f", ft ) ); // doc.add( new Field( "f", "b a b a f", ft ) ); // writer.addDocument( doc ); // writer.close(); // IndexReader reader = IndexReader.open(dir); // new FieldTermStack( reader, 0, "f", fieldQuery ); // reader.close(); //} /// <summary> /// A constructor. /// </summary> /// <param name="reader"><see cref="IndexReader"/> of the index</param> /// <param name="docId">document id to be highlighted</param> /// <param name="fieldName">field of the document to be highlighted</param> /// <param name="fieldQuery"><see cref="FieldQuery"/> object</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public FieldTermStack(IndexReader reader, int docId, string fieldName, FieldQuery fieldQuery) { this.fieldName = fieldName; ISet <string> termSet = fieldQuery.GetTermSet(fieldName); // just return to make null snippet if unmatched fieldName specified when fieldMatch == true if (termSet == null) { return; } Fields vectors = reader.GetTermVectors(docId); if (vectors == null) { // null snippet return; } Terms vector = vectors.GetTerms(fieldName); if (vector == null) { // null snippet return; } CharsRef spare = new CharsRef(); TermsEnum termsEnum = vector.GetIterator(null); DocsAndPositionsEnum dpEnum = null; BytesRef text; int numDocs = reader.MaxDoc; while ((text = termsEnum.Next()) != null) { UnicodeUtil.UTF8toUTF16(text, spare); string term = spare.ToString(); if (!termSet.Contains(term)) { continue; } dpEnum = termsEnum.DocsAndPositions(null, dpEnum); if (dpEnum == null) { // null snippet return; } dpEnum.NextDoc(); // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html float weight = (float)(Math.Log(numDocs / (double)(reader.DocFreq(new Term(fieldName, text)) + 1)) + 1.0); int freq = dpEnum.Freq; for (int i = 0; i < freq; i++) { int pos = dpEnum.NextPosition(); if (dpEnum.StartOffset < 0) { return; // no offsets, null snippet } termList.Add(new TermInfo(term, dpEnum.StartOffset, dpEnum.EndOffset, pos, weight)); } } // sort by position CollectionUtil.TimSort(termList); // now look for dups at the same position, linking them together int currentPos = -1; TermInfo previous = null; TermInfo first = null; for (int i = 0; i < termList.Count;) { TermInfo current = termList[i]; if (current.Position == currentPos) { Debug.Assert(previous != null); previous.SetNext(current); previous = current; //iterator.Remove(); // LUCENENET NOTE: Remove, but don't advance the i position (since removing will advance to the next item) termList.RemoveAt(i); } else { if (previous != null) { previous.SetNext(first); } previous = first = current; currentPos = current.Position; // LUCENENET NOTE: Only increment the position if we don't do a delete.
i++; } } if (previous != null) { previous.SetNext(first); } }
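// Hedged usage sketch (not part of the library): the commented-out main() above shows the intent, but it
// references an undeclared directory variable, relies on the non-public FieldQuery constructor, and predates the
// .NET API names. The sketch below re-expresses it with the Lucene.NET 4.8-style identifiers seen elsewhere in
// this listing (FieldType properties, writer.Dispose via using, DirectoryReader.Open) and obtains the FieldQuery
// through the public FastVectorHighlighter.GetFieldQuery(query) instead of the internal constructor. Namespaces
// such as Lucene.Net.Analysis.Core, Lucene.Net.QueryParsers.Classic and Lucene.Net.Search.VectorHighlight are assumed.
public static void FieldTermStackUsageSketch()
{
    Analyzer analyzer = new WhitespaceAnalyzer(LuceneVersion.LUCENE_48);
    QueryParser parser = new QueryParser(LuceneVersion.LUCENE_48, "f", analyzer);
    Query query = parser.Parse("a x:b");
    FieldQuery fieldQuery = new FastVectorHighlighter().GetFieldQuery(query);

    using (Directory dir = new RAMDirectory())
    {
        using (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)))
        {
            // FieldTermStack needs term vectors with positions and offsets stored for the highlighted field
            FieldType ft = new FieldType(TextField.TYPE_STORED)
            {
                StoreTermVectors = true,
                StoreTermVectorOffsets = true,
                StoreTermVectorPositions = true
            };
            Document doc = new Document();
            doc.Add(new Field("f", "a a a b b c a b b c d e f", ft));
            doc.Add(new Field("f", "b a b a f", ft));
            writer.AddDocument(doc);
        }

        using (IndexReader reader = DirectoryReader.Open(dir))
        {
            FieldTermStack stack = new FieldTermStack(reader, 0, "f", fieldQuery); // builds the term/position/offset stack for doc 0
        }
    }
}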
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) { if (outerInstance.fieldInfo.IndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { // Positions were not indexed: return(null); } DecodeMetaData(); return(outerInstance.outerInstance.postingsReader.DocsAndPositions(outerInstance.fieldInfo, state, liveDocs, reuse, flags)); }
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { throw new System.NotSupportedException(); }
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { if (_fieldReader._fieldInfo.FieldIndexOptions < FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { // Positions were not indexed: return null; } DecodeMetaData(); return _blockTermsReader._postingsReader.DocsAndPositions(_fieldReader._fieldInfo, _state, liveDocs, reuse, flags); }
// for testing internal virtual bool reused(DocsAndPositionsEnum other) { if (other == null || !(other is SortingDocsAndPositionsEnum)) { return false; } return docs == ((SortingDocsAndPositionsEnum) other).docs; }
public override DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { bool hasOffsets = field.IndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; if (field.IndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { return null; } decodeMetaData(); FSTDocsAndPositionsEnum docsAndPositionsEnum; if (reuse == null || !(reuse is FSTDocsAndPositionsEnum)) { docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.hasPayloads(), hasOffsets); } else { docsAndPositionsEnum = (FSTDocsAndPositionsEnum) reuse; if (!docsAndPositionsEnum.canReuse(field.hasPayloads(), hasOffsets)) { docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.hasPayloads(), hasOffsets); } } //System.out.println("D&P reset this=" + this); return docsAndPositionsEnum.reset(postingsSpare, liveDocs, docFreq_Renamed); }
// only for EmptyTermSpans (below) internal TermSpans() { Term = null; Postings_Renamed = null; }
/// <summary> /// checks docs + freqs + positions + payloads, sequentially /// </summary> public virtual void AssertDocsAndPositionsEnum(DocsAndPositionsEnum leftDocs, DocsAndPositionsEnum rightDocs) { if (leftDocs == null || rightDocs == null) { Assert.IsNull(leftDocs); Assert.IsNull(rightDocs); return; } Assert.AreEqual(-1, leftDocs.DocID()); Assert.AreEqual(-1, rightDocs.DocID()); int docid; while ((docid = leftDocs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { Assert.AreEqual(docid, rightDocs.NextDoc()); int freq = leftDocs.Freq(); Assert.AreEqual(freq, rightDocs.Freq()); for (int i = 0; i < freq; i++) { Assert.AreEqual(leftDocs.NextPosition(), rightDocs.NextPosition()); // we don't assert offsets/payloads, they are allowed to be different } } Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, rightDocs.NextDoc()); }
/// <summary> /// checks advancing docs + positions /// </summary> public virtual void AssertPositionsSkipping(int docFreq, DocsAndPositionsEnum leftDocs, DocsAndPositionsEnum rightDocs) { if (leftDocs == null || rightDocs == null) { Assert.IsNull(leftDocs); Assert.IsNull(rightDocs); return; } int docid = -1; int averageGap = MAXDOC / (1 + docFreq); int skipInterval = 16; while (true) { if (Random().NextBoolean()) { // nextDoc() docid = leftDocs.NextDoc(); Assert.AreEqual(docid, rightDocs.NextDoc()); } else { // advance() int skip = docid + (int)Math.Ceiling(Math.Abs(skipInterval + Random().NextDouble() * averageGap)); docid = leftDocs.Advance(skip); Assert.AreEqual(docid, rightDocs.Advance(skip)); } if (docid == DocIdSetIterator.NO_MORE_DOCS) { return; } int freq = leftDocs.Freq(); Assert.AreEqual(freq, rightDocs.Freq()); for (int i = 0; i < freq; i++) { Assert.AreEqual(leftDocs.NextPosition(), rightDocs.NextPosition()); // we don't compare the payloads, its allowed that one is empty etc } } }
// only for EmptyTermSpans (below) internal TermSpans() { m_term = null; m_postings = null; }
/// <summary> /// checks the terms enum sequentially /// if deep is false, it does a 'shallow' test that doesnt go down to the docsenums /// </summary> public virtual void AssertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, bool deep) { BytesRef term; Bits randomBits = new RandomBits(MAXDOC, Random().NextDouble(), Random()); DocsAndPositionsEnum leftPositions = null; DocsAndPositionsEnum rightPositions = null; DocsEnum leftDocs = null; DocsEnum rightDocs = null; while ((term = leftTermsEnum.Next()) != null) { Assert.AreEqual(term, rightTermsEnum.Next()); AssertTermStats(leftTermsEnum, rightTermsEnum); if (deep) { // with payloads + off AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions)); AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions)); AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions)); // with payloads only AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); // with offsets only AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); // with positions only 
AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE)); AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE)); AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE)); // with freqs: AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs)); AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs)); // w/o freqs: AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsEnum.FLAG_NONE)); AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsEnum.FLAG_NONE)); // with freqs: AssertDocsSkipping(leftTermsEnum.DocFreq(), leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs)); AssertDocsSkipping(leftTermsEnum.DocFreq(), leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs)); // w/o freqs: AssertDocsSkipping(leftTermsEnum.DocFreq(), leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsEnum.FLAG_NONE)); AssertDocsSkipping(leftTermsEnum.DocFreq(), leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsEnum.FLAG_NONE)); } } Assert.IsNull(rightTermsEnum.Next()); }
public TermSpans(DocsAndPositionsEnum postings, Term term) { this.m_postings = postings; this.m_term = term; m_doc = -1; }
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) { // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare() if (IndexOptionsComparer.Default.Compare(outerInstance.fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { // Positions were not indexed: return(null); } DecodeMetaData(); return(outerInstance.outerInstance.postingsReader.DocsAndPositions(outerInstance.fieldInfo, state, liveDocs, reuse, flags)); }
public virtual void TestMixedVectrosVectors() { RandomIndexWriter writer = new RandomIndexWriter(Random, Directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.SIMPLE, true)).SetOpenMode(OpenMode.CREATE)); Document doc = new Document(); FieldType ft2 = new FieldType(TextField.TYPE_STORED); ft2.StoreTermVectors = true; FieldType ft3 = new FieldType(TextField.TYPE_STORED); ft3.StoreTermVectors = true; ft3.StoreTermVectorPositions = true; FieldType ft4 = new FieldType(TextField.TYPE_STORED); ft4.StoreTermVectors = true; ft4.StoreTermVectorOffsets = true; FieldType ft5 = new FieldType(TextField.TYPE_STORED); ft5.StoreTermVectors = true; ft5.StoreTermVectorOffsets = true; ft5.StoreTermVectorPositions = true; doc.Add(NewTextField("field", "one", Field.Store.YES)); doc.Add(NewField("field", "one", ft2)); doc.Add(NewField("field", "one", ft3)); doc.Add(NewField("field", "one", ft4)); doc.Add(NewField("field", "one", ft5)); writer.AddDocument(doc); IndexReader reader = writer.GetReader(); writer.Dispose(); IndexSearcher searcher = NewSearcher(reader); Query query = new TermQuery(new Term("field", "one")); ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); Fields vectors = searcher.IndexReader.GetTermVectors(hits[0].Doc); Assert.IsNotNull(vectors); Assert.AreEqual(1, vectors.Count); Terms vector = vectors.GetTerms("field"); Assert.IsNotNull(vector); Assert.AreEqual(1, vector.Count); TermsEnum termsEnum = vector.GetIterator(null); Assert.IsNotNull(termsEnum.Next()); Assert.AreEqual("one", termsEnum.Term.Utf8ToString()); Assert.AreEqual(5, termsEnum.TotalTermFreq); DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null); Assert.IsNotNull(dpEnum); Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(5, dpEnum.Freq); for (int i = 0; i < 5; i++) { Assert.AreEqual(i, dpEnum.NextPosition()); } dpEnum = termsEnum.DocsAndPositions(null, dpEnum); Assert.IsNotNull(dpEnum); Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(5, dpEnum.Freq); for (int i = 0; i < 5; i++) { dpEnum.NextPosition(); Assert.AreEqual(4 * i, dpEnum.StartOffset); Assert.AreEqual(4 * i + 3, dpEnum.EndOffset); } reader.Dispose(); }
internal SortingDocsAndPositionsEnum(int maxDoc, SortingDocsAndPositionsEnum reuse, DocsAndPositionsEnum @in, Sorter.DocMap docMap, bool storeOffsets) : base(@in) { this.maxDoc = maxDoc; this.storeOffsets = storeOffsets; if (reuse != null) { docs = reuse.docs; offsets = reuse.offsets; payload = reuse.payload; file = reuse.file; if (reuse.maxDoc == maxDoc) { sorter = reuse.sorter; } else { sorter = new DocOffsetSorter(maxDoc); } } else { docs = new int[32]; offsets = new long[32]; payload = new BytesRef(32); file = new RAMFile(); sorter = new DocOffsetSorter(maxDoc); } using (IndexOutput @out = new RAMOutputStream(file)) { int doc; int i = 0; while ((doc = @in.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (i == docs.Length) { int newLength = ArrayUtil.Oversize(i + 1, 4); docs = Arrays.CopyOf(docs, newLength); offsets = Arrays.CopyOf(offsets, newLength); } docs[i] = docMap.OldToNew(doc); offsets[i] = @out.FilePointer; AddPositions(@in, @out); i++; } upto = i; sorter.Reset(docs, offsets); sorter.Sort(0, upto); } this.postingInput = new RAMInputStream("", file); }
private IDictionary <int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms, int[] docids, IList <AtomicReaderContext> leaves, int maxPassages, Query query) { IDictionary <int, object> highlights = new Dictionary <int, object>(); PassageFormatter fieldFormatter = GetFormatter(field); if (fieldFormatter == null) { throw new NullReferenceException("PassageFormatter cannot be null"); } // check if we should do any multiterm processing Analyzer analyzer = GetIndexAnalyzer(field); CharacterRunAutomaton[] automata = new CharacterRunAutomaton[0]; if (analyzer != null) { automata = MultiTermHighlighting.ExtractAutomata(query, field); } // resize 'terms', where the last term is the multiterm matcher if (automata.Length > 0) { BytesRef[] newTerms = new BytesRef[terms.Length + 1]; System.Array.Copy(terms, 0, newTerms, 0, terms.Length); terms = newTerms; } // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes // otherwise, we will just advance() existing enums to the new document in the same segment. DocsAndPositionsEnum[] postings = null; TermsEnum termsEnum = null; int lastLeaf = -1; for (int i = 0; i < docids.Length; i++) { string content = contents[i]; if (content.Length == 0) { continue; // nothing to do } bi.SetText(content); int doc = docids[i]; int leaf = ReaderUtil.SubIndex(doc, leaves); AtomicReaderContext subContext = leaves[leaf]; AtomicReader r = subContext.AtomicReader; Debug.Assert(leaf >= lastLeaf); // increasing order // if the segment has changed, we must initialize new enums. if (leaf != lastLeaf) { Terms t = r.GetTerms(field); if (t != null) { termsEnum = t.GetIterator(null); postings = new DocsAndPositionsEnum[terms.Length]; } } if (termsEnum == null) { continue; // no terms for this field, nothing to do } // if there are multi-term matches, we have to initialize the "fake" enum for each document if (automata.Length > 0) { DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata); dp.Advance(doc - subContext.DocBase); postings[terms.Length - 1] = dp; // last term is the multiterm matcher } Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages); if (passages.Length == 0) { // no passages were returned, so ask for a default summary passages = GetEmptyHighlight(field, bi, maxPassages); } if (passages.Length > 0) { highlights[doc] = fieldFormatter.Format(passages, content); } lastLeaf = leaf; } return(highlights); }
public override DocsAndPositionsEnum DocsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { var termState = (PulsingTermState) _termState; if (termState.PostingsSize != -1) { PulsingDocsAndPositionsEnum postings; if (reuse is PulsingDocsAndPositionsEnum) { postings = (PulsingDocsAndPositionsEnum) reuse; if (!postings.CanReuse(field)) { postings = new PulsingDocsAndPositionsEnum(field); } } else { // the 'reuse' is actually the wrapped enum var previous = (PulsingDocsAndPositionsEnum) GetOther(reuse); if (previous != null && previous.CanReuse(field)) { postings = previous; } else { postings = new PulsingDocsAndPositionsEnum(field); } } if (reuse != postings) { SetOther(postings, reuse); // postings.other = reuse } return postings.Reset(liveDocs, termState); } if (!(reuse is PulsingDocsAndPositionsEnum)) return _wrappedPostingsReader.DocsAndPositions(field, termState.WrappedTermState, liveDocs, reuse, flags); var wrapped = _wrappedPostingsReader.DocsAndPositions(field, termState.WrappedTermState, liveDocs, (DocsAndPositionsEnum) GetOther(reuse), flags); SetOther(wrapped, reuse); // wrapped.other = reuse return wrapped; }
// algorithm: treat sentence snippets as miniature documents // we can intersect these with the postings lists via BreakIterator.preceding(offset),s // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)) private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) { PassageScorer scorer = GetScorer(field); if (scorer == null) { throw new NullReferenceException("PassageScorer cannot be null"); } JCG.PriorityQueue <OffsetsEnum> pq = new JCG.PriorityQueue <OffsetsEnum>(); float[] weights = new float[terms.Length]; // initialize postings for (int i = 0; i < terms.Length; i++) { DocsAndPositionsEnum de = postings[i]; int pDoc; if (de == EMPTY) { continue; } else if (de == null) { postings[i] = EMPTY; // initially if (!termsEnum.SeekExact(terms[i])) { continue; // term not found } de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS); if (de == null) { // no positions available throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); } pDoc = de.Advance(doc); } else { pDoc = de.DocID; if (pDoc < doc) { pDoc = de.Advance(doc); } } if (doc == pDoc) { weights[i] = scorer.Weight(contentLength, de.Freq); de.NextPosition(); pq.Add(new OffsetsEnum(de, i)); } } pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination JCG.PriorityQueue <Passage> passageQueue = new JCG.PriorityQueue <Passage>(n, new HighlightDocComparerAnonymousHelper1()); Passage current = new Passage(); while (pq.TryDequeue(out OffsetsEnum off)) { DocsAndPositionsEnum dp = off.dp; int start = dp.StartOffset; if (start == -1) { throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); } int end = dp.EndOffset; // LUCENE-5166: this hit would span the content limit... however more valid // hits may exist (they are sorted by start). so we pretend like we never // saw this term, it won't cause a passage to be added to passageQueue or anything. 
Debug.Assert(EMPTY.StartOffset == int.MaxValue); if (start < contentLength && end > contentLength) { continue; } if (start >= current.endOffset) { if (current.startOffset >= 0) { // finalize current current.score *= scorer.Norm(current.startOffset); // new sentence: first add 'current' to queue if (passageQueue.Count == n && current.score < passageQueue.Peek().score) { current.Reset(); // can't compete, just reset it } else { passageQueue.Enqueue(current); if (passageQueue.Count > n) { current = passageQueue.Dequeue(); current.Reset(); } else { current = new Passage(); } } } // if we exceed limit, we are done if (start >= contentLength) { Passage[] passages = passageQueue.ToArray(); foreach (Passage p in passages) { p.Sort(); } // sort in ascending order ArrayUtil.TimSort(passages, new HighlightDocComparerAnonymousHelper2()); return(passages); } // advance breakiterator Debug.Assert(BreakIterator.Done < 0); current.startOffset = Math.Max(bi.Preceding(start + 1), 0); current.endOffset = Math.Min(bi.Next(), contentLength); } int tf = 0; while (true) { tf++; BytesRef term = terms[off.id]; if (term == null) { // multitermquery match, pull from payload term = off.dp.GetPayload(); Debug.Assert(term != null); } current.AddMatch(start, end, term); if (off.pos == dp.Freq) { break; // removed from pq } else { off.pos++; dp.NextPosition(); start = dp.StartOffset; end = dp.EndOffset; } if (start >= current.endOffset || end > contentLength) { pq.Enqueue(off); break; } } current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset); } // Dead code but compiler disagrees: Debug.Assert(false); return(null); }
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { PreDocsAndPositionsEnum docsPosEnum; if (fieldInfo.FieldIndexOptions != FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { return null; } else if (reuse == null || !(reuse is PreDocsAndPositionsEnum)) { docsPosEnum = new PreDocsAndPositionsEnum(OuterInstance); } else { docsPosEnum = (PreDocsAndPositionsEnum)reuse; if (docsPosEnum.FreqStream != OuterInstance.FreqStream) { docsPosEnum = new PreDocsAndPositionsEnum(OuterInstance); } } return docsPosEnum.Reset(TermEnum, liveDocs); }
internal OffsetsEnum(DocsAndPositionsEnum dp, int id) { this.dp = dp; this.id = id; this.pos = 1; }
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { if (!outerInstance.hasPos) { return null; } // TODO: implement reuse, something like Pulsing: // it's hairy! if (outerInstance.terms[termOrd] is LowFreqTerm) { LowFreqTerm term = ((LowFreqTerm) outerInstance.terms[termOrd]); int[] postings = term.postings; byte[] payloads = term.payloads; return (new LowFreqDocsAndPositionsEnum(liveDocs, outerInstance.hasOffsets_Renamed, outerInstance.hasPayloads_Renamed)).Reset(postings, payloads); } else { HighFreqTerm term = (HighFreqTerm) outerInstance.terms[termOrd]; return (new HighFreqDocsAndPositionsEnum(liveDocs, outerInstance.hasOffsets_Renamed)).Reset( term.docIDs, term.freqs, term.positions, term.payloads); } }
private readonly IPayloadAttribute payloadAttribute; // LUCENENET: marked readonly ///<summary>Constructor</summary> /// <param name="vector"> /// Terms that contains the data for /// creating the <see cref="TokenStream"/>. Must have positions and offsets. /// </param> public TokenStreamFromTermPositionVector(Terms vector) { termAttribute = AddAttribute <ICharTermAttribute>(); positionIncrementAttribute = AddAttribute <IPositionIncrementAttribute>(); offsetAttribute = AddAttribute <IOffsetAttribute>(); payloadAttribute = AddAttribute <IPayloadAttribute>(); bool hasOffsets = vector.HasOffsets; bool hasPayloads = vector.HasPayloads; TermsEnum termsEnum = vector.GetEnumerator(); BytesRef text; DocsAndPositionsEnum dpEnum = null; while (termsEnum.MoveNext()) { text = termsEnum.Term; dpEnum = termsEnum.DocsAndPositions(null, dpEnum); dpEnum.NextDoc(); int freq = dpEnum.Freq; for (int j = 0; j < freq; j++) { int pos = dpEnum.NextPosition(); Token token; if (hasOffsets) { token = new Token(text.Utf8ToString(), dpEnum.StartOffset, dpEnum.EndOffset); } else { token = new Token(); token.SetEmpty().Append(text.Utf8ToString()); } if (hasPayloads) { // Must make a deep copy of the returned payload, // since D&PEnum API is allowed to re-use on every // call: token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload()); } // Yes - this is the position, not the increment! This is for // sorting. This value // will be corrected before use. token.PositionIncrement = pos; this.positionedTokens.Add(token); } } CollectionUtil.TimSort(this.positionedTokens, tokenComparer); int lastPosition = -1; foreach (Token token in this.positionedTokens) { int thisPosition = token.PositionIncrement; token.PositionIncrement = thisPosition - lastPosition; lastPosition = thisPosition; } this.tokensAtCurrentPosition = this.positionedTokens.GetEnumerator(); }
internal SortingDocsAndPositionsEnum(int maxDoc, SortingDocsAndPositionsEnum reuse, DocsAndPositionsEnum @in, Sorter.DocMap docMap, bool storeOffsets) : base(@in) { this.maxDoc = maxDoc; this.storeOffsets = storeOffsets; if (reuse != null) { docs = reuse.docs; offsets = reuse.offsets; payload = reuse.payload; file = reuse.file; if (reuse.maxDoc == maxDoc) { sorter = reuse.sorter; } else { sorter = new DocOffsetSorter(maxDoc); } } else { docs = new int[32]; offsets = new long[32]; payload = new BytesRef(32); file = new RAMFile(); sorter = new DocOffsetSorter(maxDoc); } IndexOutput @out = new RAMOutputStream(file); int doc; int i = 0; while ((doc = @in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (i == docs.Length) { int newLength = ArrayUtil.oversize(i + 1, 4); docs = Arrays.copyOf(docs, newLength); offsets = Arrays.copyOf(offsets, newLength); } docs[i] = docMap.oldToNew(doc); offsets[i] = @out.FilePointer; addPositions(@in, @out); i++; } upto = i; sorter.reset(docs, offsets); sorter.sort(0, upto); @out.close(); this.postingInput = new RAMInputStream("", file); }
public override DocsAndPositionsEnum DocsAndPositions(FieldInfo fieldInfo, BlockTermState termState, IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) { // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare() bool hasOffsets = IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; // TODO: can we optimize if FLAG_PAYLOADS / FLAG_OFFSETS // isn't passed? // TODO: refactor if (fieldInfo.HasPayloads || hasOffsets) { // If you are using ParellelReader, and pass in a // reused DocsEnum, it could have come from another // reader also using standard codec if (reuse is null || !(reuse is SegmentFullPositionsEnum docsEnum) || docsEnum.startFreqIn != freqIn) { docsEnum = new SegmentFullPositionsEnum(this, freqIn, proxIn); } return(docsEnum.Reset(fieldInfo, (StandardTermState)termState, liveDocs)); } else { // If you are using ParellelReader, and pass in a // reused DocsEnum, it could have come from another // reader also using standard codec if (reuse is null || !(reuse is SegmentDocsAndPositionsEnum docsEnum) || docsEnum.startFreqIn != freqIn) { docsEnum = new SegmentDocsAndPositionsEnum(this, freqIn, proxIn); } return(docsEnum.Reset(fieldInfo, (StandardTermState)termState, liveDocs)); } }
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { if (!StorePositions && !StoreOffsets) { return null; } TVDocsAndPositionsEnum docsAndPositionsEnum; if (reuse != null && reuse is TVDocsAndPositionsEnum) { docsAndPositionsEnum = (TVDocsAndPositionsEnum)reuse; } else { docsAndPositionsEnum = new TVDocsAndPositionsEnum(); } docsAndPositionsEnum.Reset(liveDocs, Positions, StartOffsets, EndOffsets, PayloadOffsets, PayloadData); return docsAndPositionsEnum; }
internal SortingDocsAndPositionsEnum(int maxDoc, SortingDocsAndPositionsEnum reuse, DocsAndPositionsEnum @in, Sorter.DocMap docMap, bool storeOffsets) : base(@in) { this.maxDoc = maxDoc; this.storeOffsets = storeOffsets; if (reuse != null) { docs = reuse.docs; offsets = reuse.offsets; payload = reuse.payload; file = reuse.file; if (reuse.maxDoc == maxDoc) { sorter = reuse.sorter; } else { sorter = new DocOffsetSorter(maxDoc); } } else { docs = new int[32]; offsets = new long[32]; payload = new BytesRef(32); file = new RAMFile(); sorter = new DocOffsetSorter(maxDoc); } using (IndexOutput @out = new RAMOutputStream(file)) { int doc; int i = 0; while ((doc = @in.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (i == docs.Length) { int newLength = ArrayUtil.Oversize(i + 1, 4); docs = Arrays.CopyOf(docs, newLength); offsets = Arrays.CopyOf(offsets, newLength); } docs[i] = docMap.OldToNew(doc); offsets[i] = @out.GetFilePointer(); AddPositions(@in, @out); i++; } upto = i; sorter.Reset(docs, offsets); sorter.Sort(0, upto); } this.postingInput = new RAMInputStream("", file); }
public virtual void TestSetPosition() { Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this); Directory store = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter(Random(), store, analyzer); Document d = new Document(); d.Add(NewTextField("field", "bogus", Field.Store.YES)); writer.AddDocument(d); IndexReader reader = writer.Reader; writer.Dispose(); IndexSearcher searcher = NewSearcher(reader); DocsAndPositionsEnum pos = MultiFields.GetTermPositionsEnum(searcher.IndexReader, MultiFields.GetLiveDocs(searcher.IndexReader), "field", new BytesRef("1")); pos.NextDoc(); // first token should be at position 0 Assert.AreEqual(0, pos.NextPosition()); pos = MultiFields.GetTermPositionsEnum(searcher.IndexReader, MultiFields.GetLiveDocs(searcher.IndexReader), "field", new BytesRef("2")); pos.NextDoc(); // second token should be at position 2 Assert.AreEqual(2, pos.NextPosition()); PhraseQuery q; ScoreDoc[] hits; q = new PhraseQuery(); q.Add(new Term("field", "1")); q.Add(new Term("field", "2")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); // same as previous, just specify positions explicitely. q = new PhraseQuery(); q.Add(new Term("field", "1"), 0); q.Add(new Term("field", "2"), 1); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); // specifying correct positions should find the phrase. q = new PhraseQuery(); q.Add(new Term("field", "1"), 0); q.Add(new Term("field", "2"), 2); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "2")); q.Add(new Term("field", "3")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "3")); q.Add(new Term("field", "4")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); // phrase query would find it when correct positions are specified. q = new PhraseQuery(); q.Add(new Term("field", "3"), 0); q.Add(new Term("field", "4"), 0); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); // phrase query should fail for non existing searched term // even if there exist another searched terms in the same searched position. q = new PhraseQuery(); q.Add(new Term("field", "3"), 0); q.Add(new Term("field", "9"), 0); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); // multi-phrase query should succed for non existing searched term // because there exist another searched terms in the same searched position. MultiPhraseQuery mq = new MultiPhraseQuery(); mq.Add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0); hits = searcher.Search(mq, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "2")); q.Add(new Term("field", "4")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "3")); q.Add(new Term("field", "5")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "4")); q.Add(new Term("field", "5")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(1, hits.Length); q = new PhraseQuery(); q.Add(new Term("field", "2")); q.Add(new Term("field", "5")); hits = searcher.Search(q, null, 1000).ScoreDocs; Assert.AreEqual(0, hits.Length); reader.Dispose(); store.Dispose(); }
public override Spans GetSpans(AtomicReaderContext context, IBits acceptDocs, IDictionary <Term, TermContext> termContexts) { TermContext termContext; termContexts.TryGetValue(m_term, out termContext); TermState state; if (termContext == null) { // this happens with span-not query, as it doesn't include the NOT side in extractTerms() // so we seek to the term now in this segment..., this sucks because its ugly mostly! Fields fields = context.AtomicReader.Fields; if (fields != null) { Terms terms = fields.GetTerms(m_term.Field); if (terms != null) { TermsEnum termsEnum = terms.GetEnumerator(); if (termsEnum.SeekExact(m_term.Bytes)) { state = termsEnum.GetTermState(); } else { state = null; } } else { state = null; } } else { state = null; } } else { state = termContext.Get(context.Ord); } if (state == null) // term is not present in that reader { return(TermSpans.EMPTY_TERM_SPANS); } TermsEnum termsEnum_ = context.AtomicReader.GetTerms(m_term.Field).GetEnumerator(); termsEnum_.SeekExact(m_term.Bytes, state); DocsAndPositionsEnum postings = termsEnum_.DocsAndPositions(acceptDocs, null, DocsAndPositionsFlags.PAYLOADS); if (postings != null) { return(new TermSpans(postings, m_term)); } else { // term does exist, but has no positions throw new InvalidOperationException("field \"" + m_term.Field + "\" was indexed without position data; cannot run SpanTermQuery (term=" + m_term.Text() + ")"); } }
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) { return(new RAMDocsAndPositionsEnum(ramField.termToDocs[current], liveDocs)); }
public virtual void TestWickedLongTerm() { using (RAMDirectory dir = new RAMDirectory()) { char[] chars = new char[IndexWriter.MAX_TERM_LENGTH]; Arrays.Fill(chars, 'x'); string bigTerm = new string(chars); Document doc = new Document(); using (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)))) { // This produces a too-long term: string contents = "abc xyz x" + bigTerm + " another term"; doc.Add(new TextField("content", contents, Field.Store.NO)); writer.AddDocument(doc); // Make sure we can add another normal document doc = new Document(); doc.Add(new TextField("content", "abc bbb ccc", Field.Store.NO)); writer.AddDocument(doc); } #pragma warning disable 612, 618 using (IndexReader reader = IndexReader.Open(dir)) #pragma warning restore 612, 618 { // Make sure all terms < max size were indexed assertEquals(2, reader.DocFreq(new Term("content", "abc"))); assertEquals(1, reader.DocFreq(new Term("content", "bbb"))); assertEquals(1, reader.DocFreq(new Term("content", "term"))); assertEquals(1, reader.DocFreq(new Term("content", "another"))); // Make sure position is still incremented when // massive term is skipped: DocsAndPositionsEnum tps = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "content", new BytesRef("another")); assertTrue(tps.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(1, tps.Freq); assertEquals(3, tps.NextPosition()); // Make sure the doc that has the massive term is in // the index: assertEquals("document with wicked long term should is not in the index!", 2, reader.NumDocs); } // Make sure we can add a document with exactly the // maximum length term, and search on that term: doc = new Document(); doc.Add(new TextField("content", bigTerm, Field.Store.NO)); ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT); sa.MaxTokenLength = 100000; using (var writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa))) { writer.AddDocument(doc); } #pragma warning disable 612, 618 using (var reader = IndexReader.Open(dir)) #pragma warning restore 612, 618 { assertEquals(1, reader.DocFreq(new Term("content", bigTerm))); } } }
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { DocsAndPositionsEnum inReuse; SortingDocsAndPositionsEnum wrapReuse; if (reuse != null && reuse is SortingDocsAndPositionsEnum) { // if we're asked to reuse the given DocsEnum and it is Sorting, return // the wrapped one, since some Codecs expect it. wrapReuse = (SortingDocsAndPositionsEnum)reuse; inReuse = wrapReuse.Wrapped; } else { wrapReuse = null; inReuse = reuse; } DocsAndPositionsEnum inDocsAndPositions = @in.DocsAndPositions(NewToOld(liveDocs), inReuse, flags); if (inDocsAndPositions == null) { return null; } // we ignore the fact that offsets may be stored but not asked for, // since this code is expected to be used during addIndexes which will // ask for everything. if that assumption changes in the future, we can // factor in whether 'flags' says offsets are not required. bool storeOffsets = indexOptions.GetValueOrDefault().CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; return new SortingDocsAndPositionsEnum(docMap.Count, wrapReuse, inDocsAndPositions, docMap, storeOffsets); }
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) { if (!outerInstance.HasPositions) { return(null); } DecodeMetaData(); return(outerInstance.outerInstance.postingsReader.DocsAndPositions(outerInstance.fieldInfo, state, liveDocs, reuse, flags)); }
internal virtual void AddPositions(DocsAndPositionsEnum @in, IndexOutput @out) { int freq = @in.Freq(); @out.WriteVInt(freq); int previousPosition = 0; int previousEndOffset = 0; for (int i = 0; i < freq; i++) { int pos = @in.NextPosition(); BytesRef payload = @in.Payload; // The low-order bit of token is set only if there is a payload, the // previous bits are the delta-encoded position. int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1); @out.WriteVInt(token); previousPosition = pos; if (storeOffsets) // don't encode offsets if they are not stored { int startOffset = @in.StartOffset(); int endOffset = @in.EndOffset(); @out.WriteVInt(startOffset - previousEndOffset); @out.WriteVInt(endOffset - startOffset); previousEndOffset = endOffset; } if (payload != null) { @out.WriteVInt(payload.Length); @out.WriteBytes(payload.Bytes, payload.Offset, payload.Length); } } }
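// Hedged decoding sketch (an assumption for illustration, not the codec's actual reader): the inverse of one loop
// iteration of AddPositions above, reading a single position entry back from a DataInput. The first VInt carries the
// delta-encoded position in its high bits and a "has payload" flag in its low bit; offsets follow only when
// storeOffsets was true at write time, and payload bytes follow only when the flag bit is set. ReadVInt/ReadBytes are
// assumed to mirror the WriteVInt/WriteBytes surface used above (newer Lucene.NET releases spell them ReadVInt32/WriteVInt32).
internal static void ReadOnePosition(DataInput @in, bool storeOffsets, ref int position, ref int lastEndOffset)
{
    int token = @in.ReadVInt();
    position += token >> 1;                     // undo the position delta encoding
    bool hasPayload = (token & 1) != 0;         // low-order bit flags a payload
    if (storeOffsets)
    {
        int startOffset = lastEndOffset + @in.ReadVInt();   // offsets are delta-encoded against the previous end offset
        int endOffset = startOffset + @in.ReadVInt();
        lastEndOffset = endOffset;
    }
    if (hasPayload)
    {
        var payload = new byte[@in.ReadVInt()];
        @in.ReadBytes(payload, 0, payload.Length);          // payload bytes are stored verbatim after their length
    }
}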
public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) { Debug.Assert(outerInstance.terms.Count > 0); AtomicReader reader = context.AtomicReader; IBits liveDocs = acceptDocs; PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[outerInstance.terms.Count]; Terms fieldTerms = reader.GetTerms(outerInstance.field); if (fieldTerms == null) { return(null); } // Reuse single TermsEnum below: TermsEnum te = fieldTerms.GetIterator(null); for (int i = 0; i < outerInstance.terms.Count; i++) { Term t = outerInstance.terms[i]; TermState state = states[i].Get(context.Ord); if (state == null) // term doesnt exist in this segment { Debug.Assert(TermNotInReader(reader, t), "no termstate found but term exists in reader"); return(null); } te.SeekExact(t.Bytes, state); DocsAndPositionsEnum postingsEnum = te.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE); // PhraseQuery on a field that did not index // positions. if (postingsEnum == null) { Debug.Assert(te.SeekExact(t.Bytes), "termstate found but no term exists in reader"); // term does exist, but has no positions throw new InvalidOperationException("field \"" + t.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + t.Text() + ")"); } postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.DocFreq, (int)outerInstance.positions[i], t); } // sort by increasing docFreq order if (outerInstance.slop == 0) { ArrayUtil.TimSort(postingsFreqs); } if (outerInstance.slop == 0) // optimize exact case { ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context)); if (s.noDocs) { return(null); } else { return(s); } } else { return(new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context))); } }
public override DocsAndPositionsEnum DocsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) { if (OuterInstance.fieldInfo.FieldIndexOptions < FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { // Positions were not indexed: return null; } Debug.Assert(!Eof); CurrentFrame.DecodeMetaData(); return OuterInstance.OuterInstance.PostingsReader.DocsAndPositions(OuterInstance.fieldInfo, CurrentFrame.State, skipDocs, reuse, flags); }
/// <summary> /// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This /// can be used to feed the highlighter with a pre-parsed token /// stream. The <see cref="Terms"/> must have offsets available. /// <para/> /// In my tests the speeds to recreate 1000 token streams using this method are: /// <list type="bullet"> /// <item><description> /// with TermVector offset only data stored - 420 milliseconds /// </description></item> /// <item><description> /// with TermVector offset AND position data stored - 271 milliseconds /// (nb timings for TermVector with position data are based on a tokenizer with contiguous /// positions - no overlaps or gaps) /// </description></item> /// <item><description> /// The cost of not using TermPositionVector to store /// pre-parsed content and using an analyzer to re-parse the original content: /// - reanalyzing the original content - 980 milliseconds /// </description></item> /// </list> /// /// The re-analyze timings will typically vary depending on - /// <list type="number"> /// <item><description> /// The complexity of the analyzer code (timings above were using a /// stemmer/lowercaser/stopword combo) /// </description></item> /// <item><description> /// The number of other fields (Lucene reads ALL fields off the disk /// when accessing just one document field - can cost dear!) /// </description></item> /// <item><description> /// Use of compression on field storage - could be faster due to compression (less disk IO) /// or slower (more CPU burn) depending on the content. /// </description></item> /// </list> /// </summary> /// <param name="tpv"></param> /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking /// to eek out the last drops of performance, set to true. 
If in doubt, set to false.</param> /// <exception cref="ArgumentException">if no offsets are available</exception> public static TokenStream GetTokenStream(Terms tpv, bool tokenPositionsGuaranteedContiguous) { if (!tpv.HasOffsets) { throw new ArgumentException("Cannot create TokenStream from Terms without offsets"); } if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions) { return(new TokenStreamFromTermPositionVector(tpv)); } bool hasPayloads = tpv.HasPayloads; // code to reconstruct the original sequence of Tokens TermsEnum termsEnum = tpv.GetEnumerator(); int totalTokens = 0; while (termsEnum.MoveNext()) { totalTokens += (int)termsEnum.TotalTermFreq; } Token[] tokensInOriginalOrder = new Token[totalTokens]; List <Token> unsortedTokens = null; termsEnum = tpv.GetEnumerator(); DocsAndPositionsEnum dpEnum = null; while (termsEnum.MoveNext()) { dpEnum = termsEnum.DocsAndPositions(null, dpEnum); if (dpEnum == null) { throw new ArgumentException("Required TermVector Offset information was not found"); } string term = termsEnum.Term.Utf8ToString(); dpEnum.NextDoc(); int freq = dpEnum.Freq; for (int posUpto = 0; posUpto < freq; posUpto++) { int pos = dpEnum.NextPosition(); if (dpEnum.StartOffset < 0) { throw new ArgumentException("Required TermVector Offset information was not found"); } Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset); if (hasPayloads) { // Must make a deep copy of the returned payload, // since D&PEnum API is allowed to re-use on every // call: token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload()); } if (tokenPositionsGuaranteedContiguous && pos != -1) { // We have positions stored and a guarantee that the token position // information is contiguous // This may be fast BUT wont work if Tokenizers used which create >1 // token in same position or // creates jumps in position numbers - this code would fail under those // circumstances // tokens stored with positions - can use this to index straight into // sorted array tokensInOriginalOrder[pos] = token; } else { // tokens NOT stored with positions or not guaranteed contiguous - must // add to list and sort later if (unsortedTokens == null) { unsortedTokens = new List <Token>(); } unsortedTokens.Add(token); } } } // If the field has been stored without position data we must perform a sort if (unsortedTokens != null) { tokensInOriginalOrder = unsortedTokens.ToArray(); ArrayUtil.TimSort(tokensInOriginalOrder, new TokenComparer()); //tokensInOriginalOrder = tokensInOriginalOrder // .OrderBy(t => t, new TokenComparer() ) // .ToArray(); } return(new StoredTokenStream(tokensInOriginalOrder)); }
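// Hedged usage sketch (illustrative, not the library's own example): pull the stored term vector for one
// document/field via IndexReader.GetTermVector and hand it to the method above. The "content" field name and docId
// are placeholders; the vector must have been indexed with offsets or the method throws ArgumentException, and
// passing false for tokenPositionsGuaranteedContiguous is the safe default per the remarks above.
public static TokenStream TokenStreamFromVectorSketch(IndexReader reader, int docId)
{
    Terms tpv = reader.GetTermVector(docId, "content"); // null if no term vector was stored for this field
    if (tpv == null)
    {
        return null;
    }
    return GetTokenStream(tpv, tokenPositionsGuaranteedContiguous: false);
}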
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { var postings = _current.Value; if (postings.POSITIONS == null && postings.START_OFFSETS == null) return null; // TODO: reuse var e = new SimpleTVDocsAndPositionsEnum(); e.Reset(liveDocs, postings.POSITIONS, postings.START_OFFSETS, postings.END_OFFSETS, postings.PAYLOADS); return e; }
/// <summary> /// Safe (but, slowish) default method to write every /// vector field in the document. /// </summary> protected void AddAllDocVectors(Fields vectors, MergeState mergeState) { if (vectors == null) { StartDocument(0); FinishDocument(); return; } int numFields = vectors.Count; if (numFields == -1) { // count manually! TODO: Maybe enforce that Fields.size() returns something valid? numFields = 0; //for (IEnumerator<string> it = vectors.Iterator(); it.hasNext();) foreach (string it in vectors) { numFields++; } } StartDocument(numFields); string lastFieldName = null; TermsEnum termsEnum = null; DocsAndPositionsEnum docsAndPositionsEnum = null; int fieldCount = 0; foreach (string fieldName in vectors) { fieldCount++; FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName); if (Debugging.AssertsEnabled) { Debugging.Assert(lastFieldName == null || fieldName.CompareToOrdinal(lastFieldName) > 0, () => "lastFieldName=" + lastFieldName + " fieldName=" + fieldName); } lastFieldName = fieldName; Terms terms = vectors.GetTerms(fieldName); if (terms == null) { // FieldsEnum shouldn't lie... continue; } bool hasPositions = terms.HasPositions; bool hasOffsets = terms.HasOffsets; bool hasPayloads = terms.HasPayloads; if (Debugging.AssertsEnabled) { Debugging.Assert(!hasPayloads || hasPositions); } int numTerms = (int)terms.Count; if (numTerms == -1) { // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function numTerms = 0; termsEnum = terms.GetEnumerator(termsEnum); while (termsEnum.MoveNext()) { numTerms++; } } StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads); termsEnum = terms.GetEnumerator(termsEnum); int termCount = 0; while (termsEnum.MoveNext()) { termCount++; int freq = (int)termsEnum.TotalTermFreq; StartTerm(termsEnum.Term, freq); if (hasPositions || hasOffsets) { docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum); if (Debugging.AssertsEnabled) { Debugging.Assert(docsAndPositionsEnum != null); } int docID = docsAndPositionsEnum.NextDoc(); if (Debugging.AssertsEnabled) { Debugging.Assert(docID != DocIdSetIterator.NO_MORE_DOCS); Debugging.Assert(docsAndPositionsEnum.Freq == freq); } for (int posUpto = 0; posUpto < freq; posUpto++) { int pos = docsAndPositionsEnum.NextPosition(); int startOffset = docsAndPositionsEnum.StartOffset; int endOffset = docsAndPositionsEnum.EndOffset; BytesRef payload = docsAndPositionsEnum.GetPayload(); if (Debugging.AssertsEnabled) { Debugging.Assert(!hasPositions || pos >= 0); } AddPosition(pos, startOffset, endOffset, payload); } } FinishTerm(); } if (Debugging.AssertsEnabled) { Debugging.Assert(termCount == numTerms); } FinishField(); } if (Debugging.AssertsEnabled) { Debugging.Assert(fieldCount == numFields); } FinishDocument(); }
public TermSpans(DocsAndPositionsEnum postings, Term term) { this.Postings_Renamed = postings; this.Term = term; Doc_Renamed = -1; }
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) { throw UnsupportedOperationException.Create(); }
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { return Delegate().DocsAndPositions(liveDocs, reuse, flags); }
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) { return(actualEnum.DocsAndPositions(liveDocs, reuse, flags)); }
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) { throw new NotSupportedException(); }
public override DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { DocsAndPositionsEnum inReuse; SortingDocsAndPositionsEnum wrapReuse; if (reuse != null && reuse is SortingDocsAndPositionsEnum) { // if we're asked to reuse the given DocsEnum and it is Sorting, return // the wrapped one, since some Codecs expect it. wrapReuse = (SortingDocsAndPositionsEnum) reuse; inReuse = wrapReuse.Wrapped; } else { wrapReuse = null; inReuse = reuse; } DocsAndPositionsEnum inDocsAndPositions = @in.docsAndPositions(newToOld(liveDocs), inReuse, flags); if (inDocsAndPositions == null) { return null; } // we ignore the fact that offsets may be stored but not asked for, // since this code is expected to be used during addIndexes which will // ask for everything. if that assumption changes in the future, we can // factor in whether 'flags' says offsets are not required. bool storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; return new SortingDocsAndPositionsEnum(docMap.size(), wrapReuse, inDocsAndPositions, docMap, storeOffsets); }
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { return(Delegate().DocsAndPositions(liveDocs, reuse, flags)); }
internal virtual void addPositions(DocsAndPositionsEnum @in, IndexOutput @out) { int freq = @in.freq(); @out.writeVInt(freq); int previousPosition = 0; int previousEndOffset = 0; for (int i = 0; i < freq; i++) { int pos = @in.nextPosition(); BytesRef payload = @in.Payload; // The low-order bit of token is set only if there is a payload, the // previous bits are the delta-encoded position. int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1); @out.writeVInt(token); previousPosition = pos; if (storeOffsets) // don't encode offsets if they are not stored { int startOffset = @in.startOffset(); int endOffset = @in.endOffset(); @out.writeVInt(startOffset - previousEndOffset); @out.writeVInt(endOffset - startOffset); previousEndOffset = endOffset; } if (payload != null) { @out.writeVInt(payload.length); @out.writeBytes(payload.bytes, payload.offset, payload.length); } } }
private void DuellReaders(CompositeReader other, AtomicReader memIndexReader)
{
    AtomicReader competitor = SlowCompositeReaderWrapper.Wrap(other);
    Fields memFields = memIndexReader.Fields;
    foreach (string field in competitor.Fields)
    {
        Terms memTerms = memFields.GetTerms(field);
        Terms iwTerms = memIndexReader.GetTerms(field);
        if (iwTerms == null)
        {
            assertNull(memTerms);
        }
        else
        {
            NumericDocValues normValues = competitor.GetNormValues(field);
            NumericDocValues memNormValues = memIndexReader.GetNormValues(field);
            if (normValues != null)
            {
                // mem idx always computes norms on the fly
                assertNotNull(memNormValues);
                assertEquals(normValues.Get(0), memNormValues.Get(0));
            }
            assertNotNull(memTerms);
            assertEquals(iwTerms.DocCount, memTerms.DocCount);
            assertEquals(iwTerms.SumDocFreq, memTerms.SumDocFreq);
            assertEquals(iwTerms.SumTotalTermFreq, memTerms.SumTotalTermFreq);
            TermsEnum iwTermsIter = iwTerms.GetIterator(null);
            TermsEnum memTermsIter = memTerms.GetIterator(null);
            if (iwTerms.HasPositions)
            {
                bool offsets = iwTerms.HasOffsets && memTerms.HasOffsets;
                while (iwTermsIter.Next() != null)
                {
                    assertNotNull(memTermsIter.Next());
                    assertEquals(iwTermsIter.Term, memTermsIter.Term);
                    DocsAndPositionsEnum iwDocsAndPos = iwTermsIter.DocsAndPositions(null, null);
                    DocsAndPositionsEnum memDocsAndPos = memTermsIter.DocsAndPositions(null, null);
                    while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                    {
                        assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc());
                        assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq);
                        for (int i = 0; i < iwDocsAndPos.Freq; i++)
                        {
                            assertEquals("term: " + iwTermsIter.Term.Utf8ToString(), iwDocsAndPos.NextPosition(), memDocsAndPos.NextPosition());
                            if (offsets)
                            {
                                assertEquals(iwDocsAndPos.StartOffset, memDocsAndPos.StartOffset);
                                assertEquals(iwDocsAndPos.EndOffset, memDocsAndPos.EndOffset);
                            }
                        }
                    }
                }
            }
            else
            {
                while (iwTermsIter.Next() != null)
                {
                    assertEquals(iwTermsIter.Term, memTermsIter.Term);
                    DocsEnum iwDocsAndPos = iwTermsIter.Docs(null, null);
                    DocsEnum memDocsAndPos = memTermsIter.Docs(null, null);
                    while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                    {
                        assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc());
                        assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq);
                    }
                }
            }
        }
    }
}
public override DocsAndPositionsEnum DocsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, DocsAndPositionsEnum reuse, int flags)
{
    bool hasOffsets = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

    // TODO: can we optimize if FLAG_PAYLOADS / FLAG_OFFSETS isn't passed?
    // TODO: refactor
    if (fieldInfo.HasPayloads() || hasOffsets)
    {
        SegmentFullPositionsEnum docsEnum;
        if (reuse == null || !(reuse is SegmentFullPositionsEnum))
        {
            docsEnum = new SegmentFullPositionsEnum(this, FreqIn, ProxIn);
        }
        else
        {
            docsEnum = (SegmentFullPositionsEnum)reuse;
            if (docsEnum.StartFreqIn != FreqIn)
            {
                // If you are using ParallelReader, and pass in a
                // reused DocsEnum, it could have come from another
                // reader also using standard codec
                docsEnum = new SegmentFullPositionsEnum(this, FreqIn, ProxIn);
            }
        }
        return docsEnum.Reset(fieldInfo, (StandardTermState)termState, liveDocs);
    }
    else
    {
        SegmentDocsAndPositionsEnum docsEnum;
        if (reuse == null || !(reuse is SegmentDocsAndPositionsEnum))
        {
            docsEnum = new SegmentDocsAndPositionsEnum(this, FreqIn, ProxIn);
        }
        else
        {
            docsEnum = (SegmentDocsAndPositionsEnum)reuse;
            if (docsEnum.StartFreqIn != FreqIn)
            {
                // If you are using ParallelReader, and pass in a
                // reused DocsEnum, it could have come from another
                // reader also using standard codec
                docsEnum = new SegmentDocsAndPositionsEnum(this, FreqIn, ProxIn);
            }
        }
        return docsEnum.Reset(fieldInfo, (StandardTermState)termState, liveDocs);
    }
}
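// Sketch (not part of the original sources) of the caller-side reuse pattern that the
// checks above are guarding: the enum obtained for one term is passed back as 'reuse'
// for the next term, and the codec either recycles it or, when it came from a different
// reader (e.g. under ParallelReader), allocates a fresh one. The helper name and its
// parameters are illustrative.
private static long CountAllPositions(TermsEnum termsEnum, Bits liveDocs)
{
    long total = 0;
    DocsAndPositionsEnum dpe = null;
    while (termsEnum.Next() != null)
    {
        // pass the previous enum back in; a null return means this term has no positions
        dpe = termsEnum.DocsAndPositions(liveDocs, dpe);
        if (dpe == null)
        {
            continue;
        }
        while (dpe.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        {
            total += dpe.Freq;
        }
    }
    return total;
}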
public virtual void TestPayloadsPos0()
{
    Directory dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, new MockPayloadAnalyzer());
    Document doc = new Document();
    doc.Add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
    writer.AddDocument(doc);
    IndexReader readerFromWriter = writer.Reader;
    AtomicReader r = SlowCompositeReaderWrapper.Wrap(readerFromWriter);

    DocsAndPositionsEnum tp = r.TermPositionsEnum(new Term("content", "a"));

    int count = 0;
    Assert.IsTrue(tp.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    // "a" occurs 4 times
    Assert.AreEqual(4, tp.Freq());
    Assert.AreEqual(0, tp.NextPosition());
    Assert.AreEqual(1, tp.NextPosition());
    Assert.AreEqual(3, tp.NextPosition());
    Assert.AreEqual(6, tp.NextPosition());

    // only one doc has "a"
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, tp.NextDoc());

    IndexSearcher @is = NewSearcher(readerFromWriter);

    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = new SpanQuery[] { stq1, stq2 };
    SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);

    count = 0;
    bool sawZero = false;
    if (VERBOSE)
    {
        Console.WriteLine("\ngetPayloadSpans test");
    }
    Search.Spans.Spans pspans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
    while (pspans.Next())
    {
        if (VERBOSE)
        {
            Console.WriteLine("doc " + pspans.Doc() + ": span " + pspans.Start() + " to " + pspans.End());
        }
        ICollection<sbyte[]> payloads = pspans.Payload;
        sawZero |= pspans.Start() == 0;
        foreach (sbyte[] bytes in payloads)
        {
            count++;
            if (VERBOSE)
            {
                Console.WriteLine(" payload: " + Encoding.UTF8.GetString((byte[])(Array)bytes));
            }
        }
    }
    Assert.IsTrue(sawZero);
    Assert.AreEqual(5, count);

    // System.out.println("\ngetSpans test");
    Search.Spans.Spans spans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
    count = 0;
    sawZero = false;
    while (spans.Next())
    {
        count++;
        sawZero |= spans.Start() == 0;
        // System.out.println(spans.Doc() + " - " + spans.Start() + " - " + spans.End());
    }
    Assert.AreEqual(4, count);
    Assert.IsTrue(sawZero);

    // System.out.println("\nPayloadSpanUtil test");
    sawZero = false;
    PayloadSpanUtil psu = new PayloadSpanUtil(@is.TopReaderContext);
    ICollection<sbyte[]> pls = psu.GetPayloadsForQuery(snq);
    count = pls.Count;
    foreach (sbyte[] bytes in pls)
    {
        string s = Encoding.UTF8.GetString((byte[])(Array)bytes);
        //System.out.println(s);
        sawZero |= s.Equals("pos: 0");
    }
    Assert.AreEqual(5, count);
    Assert.IsTrue(sawZero);
    writer.Dispose();
    @is.IndexReader.Dispose();
    dir.Dispose();
}
public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags)
{
    return ActualEnum.DocsAndPositions(liveDocs, reuse, flags);
}
public override DocsAndPositionsEnum DocsAndPositions(FieldInfo fieldInfo, BlockTermState bTermState, Bits liveDocs, DocsAndPositionsEnum reuse, int flags)
{
    Debug.Assert(fieldInfo.FieldIndexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

    var termState = (SepTermState)bTermState;

    SepDocsAndPositionsEnum postingsEnum;
    if (!(reuse is SepDocsAndPositionsEnum))
    {
        postingsEnum = new SepDocsAndPositionsEnum(this);
    }
    else
    {
        postingsEnum = (SepDocsAndPositionsEnum)reuse;
        if (postingsEnum.START_DOC_IN != _docIn)
        {
            // If you are using ParallelReader, and pass in a
            // reused DocsAndPositionsEnum, it could have come
            // from another reader also using sep codec
            postingsEnum = new SepDocsAndPositionsEnum(this);
        }
    }

    return postingsEnum.Init(fieldInfo, termState, liveDocs);
}