/// <summary>
/// Builds and scores the passages of one document for one field.
/// <para/>
/// Algorithm: treat sentence snippets as miniature documents. The sentences are
/// intersected with the postings lists via <c>BreakIterator.Preceding(offset)</c>, and
/// each sentence is scored as <c>norm(sentenceStartOffset) * sum(weight * tf(freq))</c>.
/// </summary>
/// <param name="field">Field name; used to look up the scorer and in error messages.</param>
/// <param name="terms">Terms to match. A <c>null</c> entry means the matched term text is pulled from the payload.</param>
/// <param name="contentLength">Offset limit; hits that span it are skipped and the first hit starting at or past it ends the scan.</param>
/// <param name="bi">Break iterator used to find the bounds of the passage around a hit.</param>
/// <param name="doc">Document id the postings are advanced to.</param>
/// <param name="termsEnum">Terms enum used to seek terms whose postings have not been pulled yet.</param>
/// <param name="postings">Per-term postings cache; <c>null</c> entries are filled in here, <c>EMPTY</c> entries are skipped.</param>
/// <param name="n">Maximum number of passages kept in the passage queue.</param>
/// <returns>The retained passages, sorted with <c>HighlightDocComparerAnonymousHelper2</c>.</returns>
/// <exception cref="ArgumentException">If the field was indexed without offsets.</exception>
private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
{
    PassageScorer scorer = GetScorer(field);
    if (scorer == null)
    {
        throw new NullReferenceException("PassageScorer cannot be null");
    }

    var offsetsQueue = new Support.PriorityQueue<OffsetsEnum>();
    var weights = new float[terms.Length];

    // Position every term's postings on the requested document.
    for (int termIndex = 0; termIndex < terms.Length; termIndex++)
    {
        DocsAndPositionsEnum termPostings = postings[termIndex];
        if (termPostings == EMPTY)
        {
            // An earlier call already marked this slot as EMPTY.
            continue;
        }

        int postingsDoc;
        if (termPostings != null)
        {
            // Cached enum: only advance when it is still behind the target document.
            postingsDoc = termPostings.DocID;
            if (postingsDoc < doc)
            {
                postingsDoc = termPostings.Advance(doc);
            }
        }
        else
        {
            // First visit: mark as EMPTY until the term is actually found.
            postings[termIndex] = EMPTY;
            if (!termsEnum.SeekExact(terms[termIndex]))
            {
                continue; // term not found
            }

            termPostings = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
            postings[termIndex] = termPostings;
            if (termPostings == null)
            {
                // no positions available
                throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
            }

            postingsDoc = termPostings.Advance(doc);
        }

        if (postingsDoc == doc)
        {
            weights[termIndex] = scorer.Weight(contentLength, termPostings.Freq);
            termPostings.NextPosition();
            offsetsQueue.Add(new OffsetsEnum(termPostings, termIndex));
        }
    }

    // Sentinel: EMPTY reports int.MaxValue as its start offset, which terminates the scan below.
    offsetsQueue.Add(new OffsetsEnum(EMPTY, int.MaxValue));

    var passageQueue = new Support.PriorityQueue<Passage>(n, new HighlightDocComparerAnonymousHelper1());
    Passage current = new Passage();

    OffsetsEnum head;
    while ((head = offsetsQueue.Poll()) != null)
    {
        DocsAndPositionsEnum positions = head.dp;
        int start = positions.StartOffset;
        if (start == -1)
        {
            throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
        }
        int end = positions.EndOffset;

        // LUCENE-5166: this hit would span the content limit... however more valid
        // hits may exist (they are sorted by start). So we pretend like we never
        // saw this term; it won't cause a passage to be added to passageQueue or anything.
        Debug.Assert(EMPTY.StartOffset == int.MaxValue);
        if (start < contentLength && end > contentLength)
        {
            continue;
        }

        if (start >= current.endOffset)
        {
            if (current.startOffset >= 0)
            {
                // Finalize the passage we were building.
                current.score *= scorer.Norm(current.startOffset);

                // New sentence: first hand 'current' to the queue (or recycle it).
                if (passageQueue.Count == n && current.score < passageQueue.Peek().score)
                {
                    current.Reset(); // can't compete, just reset it
                }
                else
                {
                    passageQueue.Offer(current);
                    if (passageQueue.Count > n)
                    {
                        // Reuse the evicted passage object for the next sentence.
                        current = passageQueue.Poll();
                        current.Reset();
                    }
                    else
                    {
                        current = new Passage();
                    }
                }
            }

            // Once a hit starts at or past the limit, we are done.
            if (start >= contentLength)
            {
                Passage[] result = passageQueue.ToArray();
                foreach (Passage passage in result)
                {
                    passage.Sort();
                }

                // sort in ascending order
                ArrayUtil.TimSort(result, new HighlightDocComparerAnonymousHelper2());
                return result;
            }

            // Advance the break iterator to the sentence containing this hit.
            Debug.Assert(BreakIterator.DONE < 0);
            current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
            current.endOffset = Math.Min(bi.Next(), contentLength);
        }

        // Consume every occurrence of this term that falls inside the current passage.
        int matchCount = 0;
        while (true)
        {
            matchCount++;

            BytesRef term = terms[head.id];
            if (term == null)
            {
                // multitermquery match, pull from payload
                term = head.dp.GetPayload();
                Debug.Assert(term != null);
            }
            current.AddMatch(start, end, term);

            if (head.pos == positions.Freq)
            {
                break; // exhausted: stays removed from the offsets queue
            }

            head.pos++;
            positions.NextPosition();
            start = positions.StartOffset;
            end = positions.EndOffset;

            if (start >= current.endOffset || end > contentLength)
            {
                // Next occurrence belongs to a later passage: re-queue and stop.
                offsetsQueue.Offer(head);
                break;
            }
        }

        current.score += weights[head.id] * scorer.Tf(matchCount, current.endOffset - current.startOffset);
    }

    // Dead code but compiler disagrees:
    Debug.Assert(false);
    return null;
}
/// <summary>
/// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This
/// can be used to feed the highlighter with a pre-parsed token
/// stream. The <see cref="Terms"/> must have offsets available.
/// <para/>
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// <list type="bullet">
///     <item><description>
///     with TermVector offset only data stored - 420 milliseconds
///     </description></item>
///     <item><description>
///     with TermVector offset AND position data stored - 271 milliseconds
///     (nb timings for TermVector with position data are based on a tokenizer with contiguous
///     positions - no overlaps or gaps)
///     </description></item>
///     <item><description>
///     The cost of not using TermPositionVector to store
///     pre-parsed content and using an analyzer to re-parse the original content:
///     - reanalyzing the original content - 980 milliseconds
///     </description></item>
/// </list>
///
/// The re-analyze timings will typically vary depending on -
/// <list type="number">
///     <item><description>
///     The complexity of the analyzer code (timings above were using a
///     stemmer/lowercaser/stopword combo)
///     </description></item>
///     <item><description>
///     The number of other fields (Lucene reads ALL fields off the disk
///     when accessing just one document field - can cost dear!)
///     </description></item>
///     <item><description>
///     Use of compression on field storage - could be faster due to compression (less disk IO)
///     or slower (more CPU burn) depending on the content.
///     </description></item>
/// </list>
/// </summary>
/// <param name="tpv">The <see cref="Terms"/> to rebuild the token stream from. Must have offsets.</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
/// <returns>A token stream replaying the tokens recorded in <paramref name="tpv"/>.</returns>
/// <exception cref="ArgumentException">if no offsets are available</exception>
public static TokenStream GetTokenStream(Terms tpv, bool tokenPositionsGuaranteedContiguous)
{
    if (!tpv.HasOffsets)
    {
        throw new ArgumentException("Cannot create TokenStream from Terms without offsets");
    }

    if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions)
    {
        return new TokenStreamFromTermPositionVector(tpv);
    }

    bool hasPayloads = tpv.HasPayloads;

    // code to reconstruct the original sequence of Tokens
    // First pass: count the tokens so the position-indexed array can be sized.
    TermsEnum termsEnum = tpv.GetIterator(null);
    int totalTokens = 0;
    while (termsEnum.Next() != null)
    {
        totalTokens += (int)termsEnum.TotalTermFreq;
    }

    Token[] tokensInOriginalOrder = new Token[totalTokens];
    List<Token> unsortedTokens = null;

    // Second pass: materialize one Token per stored position.
    termsEnum = tpv.GetIterator(null);
    BytesRef text;
    DocsAndPositionsEnum dpEnum = null;
    while ((text = termsEnum.Next()) != null)
    {
        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        if (dpEnum == null)
        {
            throw new ArgumentException("Required TermVector Offset information was not found");
        }

        string term = text.Utf8ToString();
        dpEnum.NextDoc();
        int freq = dpEnum.Freq;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            int pos = dpEnum.NextPosition();
            if (dpEnum.StartOffset < 0)
            {
                throw new ArgumentException("Required TermVector Offset information was not found");
            }

            Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset);
            if (hasPayloads)
            {
                // A field that stores payloads may still have individual positions
                // without one; copying a null payload would throw.
                BytesRef payload = dpEnum.GetPayload();
                if (payload != null)
                {
                    // Must make a deep copy of the returned payload,
                    // since D&PEnum API is allowed to re-use on every
                    // call:
                    token.Payload = BytesRef.DeepCopyOf(payload);
                }
            }

            if (tokenPositionsGuaranteedContiguous && pos != -1)
            {
                // We have positions stored and a guarantee that the token position
                // information is contiguous.

                // This may be fast BUT won't work if Tokenizers are used which create >1
                // token in the same position or create jumps in position numbers -
                // this code would fail under those circumstances.

                // Tokens stored with positions - can use this to index straight into
                // the sorted array.
                tokensInOriginalOrder[pos] = token;
            }
            else
            {
                // Tokens NOT stored with positions or not guaranteed contiguous - must
                // add to list and sort later.
                if (unsortedTokens == null)
                {
                    unsortedTokens = new List<Token>();
                }
                unsortedTokens.Add(token);
            }
        }
    }

    // If the field has been stored without position data we must perform a sort
    if (unsortedTokens != null)
    {
        tokensInOriginalOrder = unsortedTokens.ToArray();
        ArrayUtil.TimSort(tokensInOriginalOrder, new TokenComparer());
    }

    return new StoredTokenStream(tokensInOriginalOrder);
}
/// <summary>
/// Safe (but, slowish) default method to write every
/// vector field in the document.
/// </summary>
/// <param name="vectors">Term vectors of the document; <c>null</c> writes an empty document.</param>
/// <param name="mergeState">Merge state whose <c>FieldInfos</c> resolve each field name.</param>
protected void AddAllDocVectors(Fields vectors, MergeState mergeState)
{
    // A document without vectors is still recorded, with zero fields.
    if (vectors == null)
    {
        StartDocument(0);
        FinishDocument();
        return;
    }

    int numFields = vectors.Count;
    if (numFields == -1)
    {
        // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
        numFields = 0;
        foreach (string unused in vectors)
        {
            numFields++;
        }
    }
    StartDocument(numFields);

    string previousField = null;
    TermsEnum termsEnum = null;
    DocsAndPositionsEnum positionsEnum = null;
    int fieldCount = 0;

    foreach (string fieldName in vectors)
    {
        fieldCount++;
        FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

        // Field names are expected in strictly increasing ordinal order.
        Debug.Assert(previousField == null || fieldName.CompareToOrdinal(previousField) > 0, "lastFieldName=" + previousField + " fieldName=" + fieldName);
        previousField = fieldName;

        Terms terms = vectors.GetTerms(fieldName);
        if (terms == null)
        {
            // FieldsEnum shouldn't lie...
            continue;
        }

        bool hasPositions = terms.HasPositions;
        bool hasOffsets = terms.HasOffsets;
        bool hasPayloads = terms.HasPayloads;
        Debug.Assert(!hasPayloads || hasPositions);

        int numTerms = (int)terms.Count;
        if (numTerms == -1)
        {
            // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
            numTerms = 0;
            termsEnum = terms.GetIterator(termsEnum);
            while (termsEnum.Next() != null)
            {
                numTerms++;
            }
        }

        StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
        termsEnum = terms.GetIterator(termsEnum);

        int termCount = 0;
        while (termsEnum.Next() != null)
        {
            termCount++;

            int freq = (int)termsEnum.TotalTermFreq;
            StartTerm(termsEnum.Term, freq);

            if (hasPositions || hasOffsets)
            {
                positionsEnum = termsEnum.DocsAndPositions(null, positionsEnum);
                Debug.Assert(positionsEnum != null);

                int docID = positionsEnum.NextDoc();
                Debug.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                Debug.Assert(positionsEnum.Freq == freq);

                for (int occurrence = 0; occurrence < freq; occurrence++)
                {
                    int position = positionsEnum.NextPosition();
                    int startOffset = positionsEnum.StartOffset;
                    int endOffset = positionsEnum.EndOffset;
                    BytesRef payload = positionsEnum.GetPayload();

                    Debug.Assert(!hasPositions || position >= 0);
                    AddPosition(position, startOffset, endOffset, payload);
                }
            }

            FinishTerm();
        }

        Debug.Assert(termCount == numTerms);
        FinishField();
    }

    Debug.Assert(fieldCount == numFields);
    FinishDocument();
}
/// <summary>
/// Indexes one document through <c>MockPayloadAnalyzer</c> and checks the positions of
/// term "a", then checks span and payload counts for a span-near query over "a" and "k".
/// </summary>
public virtual void TestPayloadsPos0()
{
    Directory dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, new MockPayloadAnalyzer());

    Document document = new Document();
    document.Add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
    writer.AddDocument(document);

    IndexReader readerFromWriter = writer.GetReader();
    AtomicReader atomicReader = SlowCompositeReaderWrapper.Wrap(readerFromWriter);

    DocsAndPositionsEnum positions = atomicReader.GetTermPositionsEnum(new Term("content", "a"));
    Assert.IsTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    // "a" occurs 4 times
    Assert.AreEqual(4, positions.Freq);
    Assert.AreEqual(0, positions.NextPosition());
    Assert.AreEqual(1, positions.NextPosition());
    Assert.AreEqual(3, positions.NextPosition());
    Assert.AreEqual(6, positions.NextPosition());

    // only one doc has "a"
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());

    IndexSearcher @is = NewSearcher(readerFromWriter);

    SpanQuery[] clauses = new SpanQuery[]
    {
        new SpanTermQuery(new Term("content", "a")),
        new SpanTermQuery(new Term("content", "k"))
    };
    SpanNearQuery snq = new SpanNearQuery(clauses, 30, false);

    // --- payload spans ---
    if (Verbose)
    {
        Console.WriteLine("\ngetPayloadSpans test");
    }

    int payloadCount = 0;
    bool payloadSpanAtZero = false;
    Search.Spans.Spans payloadSpans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
    while (payloadSpans.MoveNext())
    {
        if (Verbose)
        {
            Console.WriteLine("doc " + payloadSpans.Doc + ": span " + payloadSpans.Start + " to " + payloadSpans.End);
        }

        var spanPayloads = payloadSpans.GetPayload();
        if (payloadSpans.Start == 0)
        {
            payloadSpanAtZero = true;
        }

        foreach (var payloadBytes in spanPayloads)
        {
            payloadCount++;
            if (Verbose)
            {
                Console.WriteLine(" payload: " + Encoding.UTF8.GetString(payloadBytes));
            }
        }
    }
    Assert.IsTrue(payloadSpanAtZero);
    Assert.AreEqual(5, payloadCount);

    // --- plain spans ---
    int spanCount = 0;
    bool spanAtZero = false;
    Search.Spans.Spans spans = MultiSpansWrapper.Wrap(@is.TopReaderContext, snq);
    while (spans.MoveNext())
    {
        spanCount++;
        if (spans.Start == 0)
        {
            spanAtZero = true;
        }
    }
    Assert.AreEqual(4, spanCount);
    Assert.IsTrue(spanAtZero);

    // --- PayloadSpanUtil ---
    bool utilSawPositionZero = false;
    PayloadSpanUtil payloadUtil = new PayloadSpanUtil(@is.TopReaderContext);
    var queryPayloads = payloadUtil.GetPayloadsForQuery(snq);
    int utilPayloadCount = queryPayloads.Count;
    foreach (var payloadBytes in queryPayloads)
    {
        string decoded = Encoding.UTF8.GetString(payloadBytes);
        if (decoded.Equals("pos: 0", StringComparison.Ordinal))
        {
            utilSawPositionZero = true;
        }
    }
    Assert.AreEqual(5, utilPayloadCount);
    Assert.IsTrue(utilSawPositionZero);

    writer.Dispose();
    @is.IndexReader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Indexes a single document through <c>AnalyzerAnonymousInnerClassHelper</c> and checks
/// the reported term positions, then runs phrase and multi-phrase queries with implicit
/// and explicit positions against it.
/// </summary>
public virtual void TestSetPosition()
{
    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this);
    Directory store = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, store, analyzer);

    Document document = new Document();
    document.Add(NewTextField("field", "bogus", Field.Store.YES));
    writer.AddDocument(document);

    IndexReader reader = writer.GetReader();
    writer.Dispose();

    IndexSearcher searcher = NewSearcher(reader);

    // first token should be at position 0
    DocsAndPositionsEnum termPositions = MultiFields.GetTermPositionsEnum(
        searcher.IndexReader,
        MultiFields.GetLiveDocs(searcher.IndexReader),
        "field",
        new BytesRef("1"));
    termPositions.NextDoc();
    Assert.AreEqual(0, termPositions.NextPosition());

    // second token should be at position 2
    termPositions = MultiFields.GetTermPositionsEnum(
        searcher.IndexReader,
        MultiFields.GetLiveDocs(searcher.IndexReader),
        "field",
        new BytesRef("2"));
    termPositions.NextDoc();
    Assert.AreEqual(2, termPositions.NextPosition());

    PhraseQuery phrase;
    ScoreDoc[] matches;

    // "1 2" with implicit (adjacent) positions must not match.
    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "1"));
    phrase.Add(new Term("field", "2"));
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(0, matches.Length);

    // same as previous, just specify positions explicitly.
    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "1"), 0);
    phrase.Add(new Term("field", "2"), 1);
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(0, matches.Length);

    // specifying correct positions should find the phrase.
    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "1"), 0);
    phrase.Add(new Term("field", "2"), 2);
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(1, matches.Length);

    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "2"));
    phrase.Add(new Term("field", "3"));
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(1, matches.Length);

    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "3"));
    phrase.Add(new Term("field", "4"));
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(0, matches.Length);

    // phrase query would find it when correct positions are specified.
    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "3"), 0);
    phrase.Add(new Term("field", "4"), 0);
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(1, matches.Length);

    // phrase query should fail for a non-existing searched term
    // even if there exist other searched terms in the same searched position.
    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "3"), 0);
    phrase.Add(new Term("field", "9"), 0);
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(0, matches.Length);

    // multi-phrase query should succeed for a non-existing searched term
    // because there exist other searched terms in the same searched position.
    MultiPhraseQuery multiPhrase = new MultiPhraseQuery();
    multiPhrase.Add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
    matches = searcher.Search(multiPhrase, null, 1000).ScoreDocs;
    Assert.AreEqual(1, matches.Length);

    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "2"));
    phrase.Add(new Term("field", "4"));
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(1, matches.Length);

    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "3"));
    phrase.Add(new Term("field", "5"));
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(1, matches.Length);

    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "4"));
    phrase.Add(new Term("field", "5"));
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(1, matches.Length);

    phrase = new PhraseQuery();
    phrase.Add(new Term("field", "2"));
    phrase.Add(new Term("field", "5"));
    matches = searcher.Search(phrase, null, 1000).ScoreDocs;
    Assert.AreEqual(0, matches.Length);

    reader.Dispose();
    store.Dispose();
}
/// <summary>
/// Constructor. Reads every term occurrence from <paramref name="vector"/>, builds one
/// <see cref="Token"/> per occurrence, sorts the tokens with <c>tokenComparer</c> and
/// converts their absolute positions into position increments.
/// </summary>
/// <param name="vector">
/// Terms that contains the data for
/// creating the <see cref="TokenStream"/>. Must have positions and offsets.
/// </param>
public TokenStreamFromTermPositionVector(Terms vector)
{
    termAttribute = AddAttribute<ICharTermAttribute>();
    positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
    offsetAttribute = AddAttribute<IOffsetAttribute>();
    payloadAttribute = AddAttribute<IPayloadAttribute>();

    bool hasOffsets = vector.HasOffsets;
    bool hasPayloads = vector.HasPayloads;
    TermsEnum termsEnum = vector.GetEnumerator();
    DocsAndPositionsEnum dpEnum = null;
    while (termsEnum.MoveNext())
    {
        // Decode the term text once per term instead of once per position.
        string termText = termsEnum.Term.Utf8ToString();

        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        dpEnum.NextDoc();
        int freq = dpEnum.Freq;
        for (int j = 0; j < freq; j++)
        {
            int pos = dpEnum.NextPosition();
            Token token;
            if (hasOffsets)
            {
                token = new Token(termText, dpEnum.StartOffset, dpEnum.EndOffset);
            }
            else
            {
                token = new Token();
                token.SetEmpty().Append(termText);
            }

            if (hasPayloads)
            {
                // A field that stores payloads may still have individual positions
                // without one; copying a null payload would throw.
                BytesRef payload = dpEnum.GetPayload();
                if (payload != null)
                {
                    // Must make a deep copy of the returned payload,
                    // since D&PEnum API is allowed to re-use on every
                    // call:
                    token.Payload = BytesRef.DeepCopyOf(payload);
                }
            }

            // Yes - this is the position, not the increment! This is for
            // sorting. This value
            // will be corrected before use.
            token.PositionIncrement = pos;
            this.positionedTokens.Add(token);
        }
    }

    CollectionUtil.TimSort(this.positionedTokens, tokenComparer);

    // Replace each absolute position with its distance from the previous token.
    int lastPosition = -1;
    foreach (Token token in this.positionedTokens)
    {
        int thisPosition = token.PositionIncrement;
        token.PositionIncrement = thisPosition - lastPosition;
        lastPosition = thisPosition;
    }

    this.tokensAtCurrentPosition = this.positionedTokens.GetEnumerator();
}
/// <summary>
/// Adds the same field five times with different term-vector settings (none, vectors
/// only, with positions, with offsets, with both) and checks the resulting single term
/// vector: one term "one" with frequency 5, positions 0..4, and offsets stepping by 4.
/// </summary>
public virtual void TestMixedVectrosVectors()
{
    RandomIndexWriter writer = new RandomIndexWriter(
        Random(),
        Directory,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true)).SetOpenMode(OpenMode.CREATE));

    // One field type per term-vector flavour.
    FieldType vectorsOnly = new FieldType(TextField.TYPE_STORED)
    {
        StoreTermVectors = true
    };
    FieldType vectorsWithPositions = new FieldType(TextField.TYPE_STORED)
    {
        StoreTermVectors = true,
        StoreTermVectorPositions = true
    };
    FieldType vectorsWithOffsets = new FieldType(TextField.TYPE_STORED)
    {
        StoreTermVectors = true,
        StoreTermVectorOffsets = true
    };
    FieldType vectorsWithOffsetsAndPositions = new FieldType(TextField.TYPE_STORED)
    {
        StoreTermVectors = true,
        StoreTermVectorOffsets = true,
        StoreTermVectorPositions = true
    };

    Document document = new Document();
    document.Add(NewTextField("field", "one", Field.Store.YES));
    document.Add(NewField("field", "one", vectorsOnly));
    document.Add(NewField("field", "one", vectorsWithPositions));
    document.Add(NewField("field", "one", vectorsWithOffsets));
    document.Add(NewField("field", "one", vectorsWithOffsetsAndPositions));
    writer.AddDocument(document);

    IndexReader reader = writer.Reader;
    writer.Dispose();

    IndexSearcher searcher = NewSearcher(reader);
    Query query = new TermQuery(new Term("field", "one"));
    ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    Fields vectors = searcher.IndexReader.GetTermVectors(hits[0].Doc);
    Assert.IsNotNull(vectors);
    Assert.AreEqual(1, vectors.Count);

    Terms vector = vectors.GetTerms("field");
    Assert.IsNotNull(vector);
    Assert.AreEqual(1, vector.Count);

    TermsEnum termsEnum = vector.GetIterator(null);
    Assert.IsNotNull(termsEnum.Next());
    Assert.AreEqual("one", termsEnum.Term.Utf8ToString());
    Assert.AreEqual(5, termsEnum.TotalTermFreq);

    // First pass over the postings: positions.
    DocsAndPositionsEnum postings = termsEnum.DocsAndPositions(null, null);
    Assert.IsNotNull(postings);
    Assert.IsTrue(postings.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(5, postings.Freq);
    for (int expectedPosition = 0; expectedPosition < 5; expectedPosition++)
    {
        Assert.AreEqual(expectedPosition, postings.NextPosition());
    }

    // Second pass, reusing the enum: offsets.
    postings = termsEnum.DocsAndPositions(null, postings);
    Assert.IsNotNull(postings);
    Assert.IsTrue(postings.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(5, postings.Freq);
    for (int occurrence = 0; occurrence < 5; occurrence++)
    {
        postings.NextPosition();
        Assert.AreEqual(4 * occurrence, postings.StartOffset);
        Assert.AreEqual(4 * occurrence + 3, postings.EndOffset);
    }

    reader.Dispose();
}
//public static void main( string[] args ) throws Exception {
//  Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
//  QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,  "f", analyzer );
//  Query query = parser.parse( "a x:b" );
//  FieldQuery fieldQuery = new FieldQuery( query, true, false );

//  Directory dir = new RAMDirectory();
//  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer));
//  Document doc = new Document();
//  IndexableFieldType ft = new IndexableFieldType(TextField.TYPE_STORED);
//  ft.setStoreTermVectors(true);
//  ft.setStoreTermVectorOffsets(true);
//  ft.setStoreTermVectorPositions(true);
//  doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) );
//  doc.add( new Field( "f", ft, "b a b a f" ) );
//  writer.addDocument( doc );
//  writer.close();

//  IndexReader reader = IndexReader.open(dir1);
//  new FieldTermStack( reader, 0, "f", fieldQuery );
//  reader.close();
//}

/// <summary>
/// Constructor. Collects, from the document's term vector, every occurrence of a term
/// that is in the query's term set for the field, sorts the occurrences and links the
/// ones sharing a position into a ring. The term list stays empty ("null snippet") when
/// the field has no query terms or no term vector.
/// </summary>
/// <param name="reader"><see cref="IndexReader"/> of the index</param>
/// <param name="docId">document id to be highlighted</param>
/// <param name="fieldName">field of the document to be highlighted</param>
/// <param name="fieldQuery"><see cref="FieldQuery"/> object</param>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
public FieldTermStack(IndexReader reader, int docId, string fieldName, FieldQuery fieldQuery)
{
    this.fieldName = fieldName;

    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    ISet<string> queryTerms = fieldQuery.GetTermSet(fieldName);
    if (queryTerms is null)
    {
        return;
    }

    Fields docVectors = reader.GetTermVectors(docId);
    if (docVectors is null)
    {
        return; // null snippet
    }

    Terms fieldVector = docVectors.GetTerms(fieldName);
    if (fieldVector is null)
    {
        return; // null snippet
    }

    CharsRef utf16Buffer = new CharsRef();
    TermsEnum termsEnum = fieldVector.GetEnumerator();
    DocsAndPositionsEnum postings = null;
    int maxDoc = reader.MaxDoc;

    while (termsEnum.MoveNext())
    {
        BytesRef termBytes = termsEnum.Term;
        UnicodeUtil.UTF8toUTF16(termBytes, utf16Buffer);
        string term = utf16Buffer.ToString();
        if (!queryTerms.Contains(term))
        {
            continue;
        }

        postings = termsEnum.DocsAndPositions(null, postings);
        if (postings is null)
        {
            return; // null snippet
        }

        postings.NextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        int docFreq = reader.DocFreq(new Term(fieldName, termBytes));
        double idf = Math.Log(maxDoc / (double)(docFreq + 1)) + 1.0;
        float weight = (float)idf;

        int occurrences = postings.Freq;
        for (int occurrence = 0; occurrence < occurrences; occurrence++)
        {
            int position = postings.NextPosition();
            if (postings.StartOffset < 0)
            {
                return; // no offsets, null snippet
            }
            termList.Add(new TermInfo(term, postings.StartOffset, postings.EndOffset, position, weight));
        }
    }

    // sort by position
    CollectionUtil.TimSort(termList);

    // now look for dups at the same position, linking them together
    int groupPosition = -1;
    TermInfo groupTail = null;
    TermInfo groupHead = null;
    int index = 0;
    while (index < termList.Count)
    {
        TermInfo candidate = termList[index];
        if (candidate.Position != groupPosition)
        {
            // A new position starts: close the previous ring, then open a new one.
            groupTail?.SetNext(groupHead);
            groupHead = candidate;
            groupTail = candidate;
            groupPosition = candidate.Position;

            // LUCENENET NOTE: Only increment the position if we don't do a delete.
            index++;
            continue;
        }

        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(groupTail != null);
        }

        // Same position as the ring being built: chain it on and drop it from the list.
        groupTail.SetNext(candidate);
        groupTail = candidate;

        // LUCENENET NOTE: Remove, but don't advance the index (removing shifts the next item into this slot)
        termList.RemoveAt(index);
    }

    // Close the last ring.
    groupTail?.SetNext(groupHead);
}
/// <summary>
/// Indexes a document containing a term longer than <c>IndexWriter.MAX_TERM_LENGTH</c> and
/// checks that the remaining terms, their positions and the document itself are still
/// indexed; then checks that a term of exactly the maximum length can be indexed and found.
/// </summary>
public virtual void TestWickedLongTerm()
{
    using (RAMDirectory dir = new RAMDirectory())
    {
        string bigTerm = new string('x', IndexWriter.MAX_TERM_LENGTH);

        using (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT))))
        {
            // This produces a too-long term:
            Document tooLongDoc = new Document();
            tooLongDoc.Add(new TextField("content", "abc xyz x" + bigTerm + " another term", Field.Store.NO));
            writer.AddDocument(tooLongDoc);

            // Make sure we can add another normal document
            Document normalDoc = new Document();
            normalDoc.Add(new TextField("content", "abc bbb ccc", Field.Store.NO));
            writer.AddDocument(normalDoc);
        }

#pragma warning disable 612, 618
        using (IndexReader reader = IndexReader.Open(dir))
#pragma warning restore 612, 618
        {
            // Make sure all terms < max size were indexed
            assertEquals(2, reader.DocFreq(new Term("content", "abc")));
            assertEquals(1, reader.DocFreq(new Term("content", "bbb")));
            assertEquals(1, reader.DocFreq(new Term("content", "term")));
            assertEquals(1, reader.DocFreq(new Term("content", "another")));

            // Make sure position is still incremented when
            // massive term is skipped:
            DocsAndPositionsEnum positions = MultiFields.GetTermPositionsEnum(
                reader,
                MultiFields.GetLiveDocs(reader),
                "content",
                new BytesRef("another"));
            assertTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            assertEquals(1, positions.Freq());
            assertEquals(3, positions.NextPosition());

            // Make sure the doc that has the massive term is in
            // the index:
            assertEquals("document with wicked long term should is not in the index!", 2, reader.NumDocs);
        }

        // Make sure we can add a document with exactly the
        // maximum length term, and search on that term:
        Document maxLengthDoc = new Document();
        maxLengthDoc.Add(new TextField("content", bigTerm, Field.Store.NO));
        ClassicAnalyzer longTokenAnalyzer = new ClassicAnalyzer(TEST_VERSION_CURRENT)
        {
            MaxTokenLength = 100000
        };
        using (var writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, longTokenAnalyzer)))
        {
            writer.AddDocument(maxLengthDoc);
        }

#pragma warning disable 612, 618
        using (var reader = IndexReader.Open(dir))
#pragma warning restore 612, 618
        {
            assertEquals(1, reader.DocFreq(new Term("content", bigTerm)));
        }
    }
}
/// <summary>
/// For every field exposed by <paramref name="other"/>, asserts that norms, term
/// statistics, terms, postings, positions and (when both sides have them) offsets agree
/// with those read from <paramref name="memIndexReader"/>.
/// </summary>
/// <param name="other">Composite reader whose field names drive the comparison and whose norms are compared.</param>
/// <param name="memIndexReader">Atomic reader the terms and postings are read from.</param>
private void DuellReaders(CompositeReader other, AtomicReader memIndexReader)
{
    AtomicReader competitor = SlowCompositeReaderWrapper.Wrap(other);
    Fields memFields = memIndexReader.Fields;

    foreach (string field in competitor.Fields)
    {
        Terms memTerms = memFields.GetTerms(field);
        // NOTE(review): iwTerms is read from memIndexReader, not from competitor, so the
        // term-level comparisons below look at the same reader on both sides - confirm intended.
        Terms iwTerms = memIndexReader.GetTerms(field);
        if (iwTerms == null)
        {
            assertNull(memTerms);
            continue;
        }

        NumericDocValues expectedNorms = competitor.GetNormValues(field);
        NumericDocValues actualNorms = memIndexReader.GetNormValues(field);
        if (expectedNorms != null)
        {
            // mem idx always computes norms on the fly
            assertNotNull(actualNorms);
            assertEquals(expectedNorms.Get(0), actualNorms.Get(0));
        }

        assertNotNull(memTerms);
        assertEquals(iwTerms.DocCount, memTerms.DocCount);
        assertEquals(iwTerms.SumDocFreq, memTerms.SumDocFreq);
        assertEquals(iwTerms.SumTotalTermFreq, memTerms.SumTotalTermFreq);

        TermsEnum iwTermsIter = iwTerms.GetIterator(null);
        TermsEnum memTermsIter = memTerms.GetIterator(null);

        if (!iwTerms.HasPositions)
        {
            // Docs-only comparison.
            // NOTE(review): memTermsIter is never advanced in this branch before its
            // Term is read - confirm intended.
            while (iwTermsIter.Next() != null)
            {
                assertEquals(iwTermsIter.Term, memTermsIter.Term);
                DocsEnum iwDocs = iwTermsIter.Docs(null, null);
                DocsEnum memDocs = memTermsIter.Docs(null, null);
                while (iwDocs.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                {
                    assertEquals(iwDocs.DocID, memDocs.NextDoc());
                    assertEquals(iwDocs.Freq, memDocs.Freq);
                }
            }
            continue;
        }

        // Offsets are only compared when both sides report them.
        bool compareOffsets = iwTerms.HasOffsets && memTerms.HasOffsets;
        while (iwTermsIter.Next() != null)
        {
            assertNotNull(memTermsIter.Next());
            assertEquals(iwTermsIter.Term, memTermsIter.Term);

            DocsAndPositionsEnum iwPostings = iwTermsIter.DocsAndPositions(null, null);
            DocsAndPositionsEnum memPostings = memTermsIter.DocsAndPositions(null, null);
            while (iwPostings.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
            {
                assertEquals(iwPostings.DocID, memPostings.NextDoc());
                assertEquals(iwPostings.Freq, memPostings.Freq);

                for (int occurrence = 0; occurrence < iwPostings.Freq; occurrence++)
                {
                    assertEquals("term: " + iwTermsIter.Term.Utf8ToString(), iwPostings.NextPosition(), memPostings.NextPosition());
                    if (compareOffsets)
                    {
                        assertEquals(iwPostings.StartOffset, memPostings.StartOffset);
                        assertEquals(iwPostings.EndOffset, memPostings.EndOffset);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Delegates to <c>current.NextPosition()</c> and returns its result.
/// </summary>
public override int NextPosition() => current.NextPosition();
/// <summary>
/// Writes the positions of the current document of <paramref name="in"/> to
/// <paramref name="out"/>: first the frequency, then for each position a delta-encoded
/// token, optionally the offset deltas (when <c>storeOffsets</c> is set), and the
/// payload bytes when one is present.
/// </summary>
/// <param name="in">Postings to read the frequency, positions, offsets and payloads from.</param>
/// <param name="out">Output the encoded values are written to.</param>
internal virtual void AddPositions(DocsAndPositionsEnum @in, IndexOutput @out)
{
    int freq = @in.Freq();
    @out.WriteVInt(freq);

    int lastPosition = 0;
    int lastEndOffset = 0;
    for (int remaining = freq; remaining > 0; remaining--)
    {
        int position = @in.NextPosition();
        BytesRef payload = @in.Payload;
        bool hasPayload = payload != null;

        // The low-order bit of the token is set only if there is a payload, the
        // remaining bits are the delta-encoded position.
        int positionDelta = position - lastPosition;
        int payloadFlag = hasPayload ? 1 : 0;
        @out.WriteVInt((positionDelta << 1) | payloadFlag);
        lastPosition = position;

        // don't encode offsets if they are not stored
        if (storeOffsets)
        {
            int startOffset = @in.StartOffset();
            int endOffset = @in.EndOffset();
            @out.WriteVInt(startOffset - lastEndOffset);
            @out.WriteVInt(endOffset - startOffset);
            lastEndOffset = endOffset;
        }

        if (hasPayload)
        {
            @out.WriteVInt(payload.Length);
            @out.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
        }
    }
}