public void TestKeepsLastFilter()
{
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.KeepMode = KeepMode.KM_USE_LAST_OCCURRENCE;
    ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.Length > 0);
    foreach (ScoreDoc hit in hits)
    {
        Document d = searcher.Doc(hit.Doc);
        string url = d.Get(KEY_FIELD);
        DocsEnum td = TestUtil.Docs(Random(), reader, KEY_FIELD, new BytesRef(url),
            MultiFields.GetLiveDocs(reader), null, 0);
        int lastDoc = 0;
        while (td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        {
            lastDoc = td.DocID;
        }
        assertEquals("Duplicate urls should return last doc", lastDoc, hit.Doc);
    }
}
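For contrast, a hedged sketch of the first-occurrence counterpart (KeepMode.KM_USE_FIRST_OCCURRENCE is the opposing mode; the loop body mirrors the test above, but only the first posting needs to be read):

// Sketch only: with KeepMode.KM_USE_FIRST_OCCURRENCE, each hit should be the
// first live posting for its key, so a single NextDoc() call suffices.
DocsEnum td = TestUtil.Docs(Random(), reader, KEY_FIELD, new BytesRef(url),
    MultiFields.GetLiveDocs(reader), null, 0);
int firstDoc = td.NextDoc();
assertEquals("Duplicate urls should return first doc", firstDoc, hit.Doc);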
public virtual void TestWickedLongTerm()
{
    using RAMDirectory dir = new RAMDirectory();
    char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
    Arrays.Fill(chars, 'x');
    string bigTerm = new string(chars);
    Document doc = new Document();

    using (IndexWriter writer = new IndexWriter(dir,
        new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT))))
    {
        // This produces a too-long term:
        string contents = "abc xyz x" + bigTerm + " another term";
        doc.Add(new TextField("content", contents, Field.Store.NO));
        writer.AddDocument(doc);

        // Make sure we can add another normal document
        doc = new Document();
        doc.Add(new TextField("content", "abc bbb ccc", Field.Store.NO));
        writer.AddDocument(doc);
    }

#pragma warning disable 612, 618
    using (IndexReader reader = IndexReader.Open(dir))
#pragma warning restore 612, 618
    {
        // Make sure all terms < max size were indexed
        assertEquals(2, reader.DocFreq(new Term("content", "abc")));
        assertEquals(1, reader.DocFreq(new Term("content", "bbb")));
        assertEquals(1, reader.DocFreq(new Term("content", "term")));
        assertEquals(1, reader.DocFreq(new Term("content", "another")));

        // Make sure position is still incremented when
        // massive term is skipped:
        DocsAndPositionsEnum tps = MultiFields.GetTermPositionsEnum(reader,
            MultiFields.GetLiveDocs(reader), "content", new BytesRef("another"));
        assertTrue(tps.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(1, tps.Freq);
        assertEquals(3, tps.NextPosition());

        // Make sure the doc that has the massive term is in
        // the index:
        assertEquals("document with wicked long term is not in the index!", 2, reader.NumDocs);
    }

    // Make sure we can add a document with exactly the
    // maximum length term, and search on that term:
    doc = new Document();
    doc.Add(new TextField("content", bigTerm, Field.Store.NO));
    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    sa.MaxTokenLength = 100000;
    using (var writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa)))
    {
        writer.AddDocument(doc);
    }

#pragma warning disable 612, 618
    using (var reader = IndexReader.Open(dir))
#pragma warning restore 612, 618
    {
        assertEquals(1, reader.DocFreq(new Term("content", bigTerm)));
    }
}
private void VerifyTestObjects(int count, int maxDeletedExclusive)
{
    Assert.AreEqual(count - maxDeletedExclusive, writer.NumDocs);

    using (IndexReader reader = DirectoryReader.Open(dir))
    {
        IBits liveDocs = MultiFields.GetLiveDocs(reader);

        if (liveDocs == null)
        {
            // If no records are deleted, liveDocs will be null.
            Assert.AreEqual(0, maxDeletedExclusive);
            return;
        }

        for (int i = 0; i < count; i++)
        {
            if (i < maxDeletedExclusive)
            {
                // Deleted docs must not be marked live.
                Assert.True(!liveDocs.Get(i));
            }
            else
            {
                Assert.False(!liveDocs.Get(i));
                Document doc = reader.Document(i);
                TestObject obj = doc.ToObject<TestObject>();
                Assert.AreEqual(i, obj.Number);
                Assert.AreEqual(String.Format("Test Object {0}", i), obj.String);
            }
        }
    }
}
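The TestObject type is not shown in this section; here is a minimal sketch of the shape implied by the assertions above (hypothetical, for illustration only):

// Hypothetical POCO matching the obj.Number / obj.String assertions above.
public class TestObject
{
    public int Number { get; set; }
    public string String { get; set; }
}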
protected internal ValueSourceScorer(IndexReader reader, FunctionValues values)
    : base(null)
{
    this.reader = reader;
    this.maxDoc = reader.MaxDoc;
    this.values = values;
    CheckDeletes = true;
    this.liveDocs = MultiFields.GetLiveDocs(reader);
}
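A minimal sketch of how a scorer like this typically consults the stored liveDocs (the Matches method below is illustrative, not necessarily the actual ValueSourceScorer member; MultiFields.GetLiveDocs returns null when the index holds no deletions):

// Illustrative only: the usual per-document deletion check against liveDocs.
public bool Matches(int doc)
{
    // A null liveDocs means nothing is deleted, so every document is live.
    return !CheckDeletes || liveDocs == null || liveDocs.Get(doc);
}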
/// <summary>
/// Creates an iterator over term, weight and payload fields from the Lucene
/// index. Setting <see cref="HasPayloads"/> to <c>false</c> implies an iterator
/// over only term and weight.
/// </summary>
public DocumentInputIterator(DocumentDictionary outerInstance, bool hasPayloads, bool hasContexts)
{
    this.outerInstance = outerInstance;
    this.hasPayloads = hasPayloads;
    this.hasContexts = hasContexts;
    docCount = outerInstance.reader.MaxDoc - 1;
    weightValues = (outerInstance.weightField != null)
        ? MultiDocValues.GetNumericValues(outerInstance.reader, outerInstance.weightField)
        : null;
    liveDocs = (outerInstance.reader.Leaves.Count > 0)
        ? MultiFields.GetLiveDocs(outerInstance.reader)
        : null;
    relevantFields = GetRelevantFields(new string[]
    {
        outerInstance.field, outerInstance.weightField,
        outerInstance.payloadField, outerInstance.contextsField
    });
}
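A hedged consumption sketch for this iterator-style API, assuming inputIterator is a DocumentInputIterator obtained from the dictionary (member names follow the Lucene 4.x InputIterator contract and may differ between Lucene.NET betas; Next() conventionally returns null at the end of the stream):

// Sketch only: drain a DocumentInputIterator, reading weight/payload per term.
BytesRef term;
while ((term = inputIterator.Next()) != null)
{
    long weight = inputIterator.Weight;       // read after each Next() call
    BytesRef payload = inputIterator.Payload; // null unless hasPayloads was set
}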
private FixedBitSet CreateExpectedResult(string queryValue, bool from, IndexReader topLevelReader, IndexIterationContext context)
{
    IDictionary<string, IList<RandomDoc>> randomValueDocs;
    IDictionary<string, IList<RandomDoc>> linkValueDocuments;
    if (from)
    {
        randomValueDocs = context.RandomValueFromDocs;
        linkValueDocuments = context.ToDocuments;
    }
    else
    {
        randomValueDocs = context.RandomValueToDocs;
        linkValueDocuments = context.FromDocuments;
    }

    FixedBitSet expectedResult = new FixedBitSet(topLevelReader.MaxDoc);
    if (!randomValueDocs.TryGetValue(queryValue, out IList<RandomDoc> matchingDocs) || matchingDocs == null)
    {
        return expectedResult; // still empty: no docs match the query value
    }

    foreach (RandomDoc matchingDoc in matchingDocs)
    {
        foreach (string linkValue in matchingDoc.linkValues)
        {
            if (!linkValueDocuments.TryGetValue(linkValue, out IList<RandomDoc> otherMatchingDocs) || otherMatchingDocs == null)
            {
                continue;
            }

            foreach (RandomDoc otherSideDoc in otherMatchingDocs)
            {
                DocsEnum docsEnum = MultiFields.GetTermDocsEnum(topLevelReader,
                    MultiFields.GetLiveDocs(topLevelReader), "id", new BytesRef(otherSideDoc.id), 0);
                if (Debugging.AssertsEnabled) Debugging.Assert(docsEnum != null);
                int doc = docsEnum.NextDoc();
                expectedResult.Set(doc);
            }
        }
    }
    return expectedResult;
}
public virtual void TestCaching()
{
    Directory dir = new RAMDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);
    Document doc = new Document();
    TokenStream stream = new TokenStreamAnonymousInnerClassHelper(this);
    stream = new CachingTokenFilter(stream);

    doc.Add(new TextField("preanalyzed", stream));

    // 1) we consume all tokens twice before we add the doc to the index
    CheckTokens(stream);
    stream.Reset();
    CheckTokens(stream);

    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.AddDocument(doc);

    IndexReader reader = writer.GetReader();
    DocsAndPositionsEnum termPositions = MultiFields.GetTermPositionsEnum(reader,
        MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term1"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(1, termPositions.Freq);
    Assert.AreEqual(0, termPositions.NextPosition());

    termPositions = MultiFields.GetTermPositionsEnum(reader,
        MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term2"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(2, termPositions.Freq);
    Assert.AreEqual(1, termPositions.NextPosition());
    Assert.AreEqual(3, termPositions.NextPosition());

    termPositions = MultiFields.GetTermPositionsEnum(reader,
        MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term3"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(1, termPositions.Freq);
    Assert.AreEqual(2, termPositions.NextPosition());
    reader.Dispose();
    writer.Dispose();

    // 3) reset stream and consume tokens again
    stream.Reset();
    CheckTokens(stream);

    dir.Dispose();
}
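The CheckTokens helper is referenced but not defined in this section; below is a hedged sketch of its assumed shape, consuming the stream and comparing each term against an expected tokens array (the tokens field is assumed to exist on the test class):

// Assumed helper: verifies the stream yields exactly the expected terms in order.
private void CheckTokens(TokenStream stream)
{
    int count = 0;
    ICharTermAttribute termAtt = stream.GetAttribute<ICharTermAttribute>();
    while (stream.IncrementToken())
    {
        Assert.IsTrue(count < tokens.Length);
        Assert.AreEqual(tokens[count], termAtt.ToString());
        count++;
    }
    Assert.AreEqual(tokens.Length, count);
}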
public virtual void TestMutipleDocument()
{
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
        new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer()));
    Document doc = new Document();
    doc.Add(new TextField("partnum", "Q36", Field.Store.YES));
    writer.AddDocument(doc);
    doc = new Document();
    doc.Add(new TextField("partnum", "Q37", Field.Store.YES));
    writer.AddDocument(doc);
    writer.Dispose();

    IndexReader reader = DirectoryReader.Open(dir);
    DocsEnum td = TestUtil.Docs(Random(), reader, "partnum", new BytesRef("Q36"),
        MultiFields.GetLiveDocs(reader), null, 0);
    assertTrue(td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    td = TestUtil.Docs(Random(), reader, "partnum", new BytesRef("Q37"),
        MultiFields.GetLiveDocs(reader), null, 0);
    assertTrue(td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
}
public virtual void TestSetPosition()
{
    Analyzer analyzer = new AnalyzerAnonymousClass(this);
    Directory store = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random, store, analyzer);
    Document d = new Document();
    d.Add(NewTextField("field", "bogus", Field.Store.YES));
    writer.AddDocument(d);
    IndexReader reader = writer.GetReader();
    writer.Dispose();

    IndexSearcher searcher = NewSearcher(reader);

    DocsAndPositionsEnum pos = MultiFields.GetTermPositionsEnum(searcher.IndexReader,
        MultiFields.GetLiveDocs(searcher.IndexReader), "field", new BytesRef("1"));
    pos.NextDoc();
    // first token should be at position 0
    Assert.AreEqual(0, pos.NextPosition());

    pos = MultiFields.GetTermPositionsEnum(searcher.IndexReader,
        MultiFields.GetLiveDocs(searcher.IndexReader), "field", new BytesRef("2"));
    pos.NextDoc();
    // second token should be at position 2
    Assert.AreEqual(2, pos.NextPosition());

    PhraseQuery q;
    ScoreDoc[] hits;

    q = new PhraseQuery();
    q.Add(new Term("field", "1"));
    q.Add(new Term("field", "2"));
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // same as previous, just specify positions explicitly.
    q = new PhraseQuery();
    q.Add(new Term("field", "1"), 0);
    q.Add(new Term("field", "2"), 1);
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // specifying correct positions should find the phrase.
    q = new PhraseQuery();
    q.Add(new Term("field", "1"), 0);
    q.Add(new Term("field", "2"), 2);
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    q = new PhraseQuery();
    q.Add(new Term("field", "2"));
    q.Add(new Term("field", "3"));
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    q = new PhraseQuery();
    q.Add(new Term("field", "3"));
    q.Add(new Term("field", "4"));
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // phrase query would find it when correct positions are specified.
    q = new PhraseQuery();
    q.Add(new Term("field", "3"), 0);
    q.Add(new Term("field", "4"), 0);
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    // phrase query should fail for a non-existing searched term
    // even if another searched term exists at the same position.
    q = new PhraseQuery();
    q.Add(new Term("field", "3"), 0);
    q.Add(new Term("field", "9"), 0);
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // multi-phrase query should succeed for a non-existing searched term
    // because another searched term exists at the same position.
    MultiPhraseQuery mq = new MultiPhraseQuery();
    mq.Add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
    hits = searcher.Search(mq, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    q = new PhraseQuery();
    q.Add(new Term("field", "2"));
    q.Add(new Term("field", "4"));
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    q = new PhraseQuery();
    q.Add(new Term("field", "3"));
    q.Add(new Term("field", "5"));
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    q = new PhraseQuery();
    q.Add(new Term("field", "4"));
    q.Add(new Term("field", "5"));
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    q = new PhraseQuery();
    q.Add(new Term("field", "2"));
    q.Add(new Term("field", "5"));
    hits = searcher.Search(q, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    reader.Dispose();
    store.Dispose();
}
/// <summary>
/// Creates an enumerator over term, weight and payload fields from the Lucene
/// index. Setting <paramref name="hasPayloads"/> to <c>false</c> implies an enumerator
/// over only term and weight.
/// </summary>
public DocumentInputEnumerator(DocumentDictionary documentDictionary, bool hasPayloads, bool hasContexts)
{
    this.outerInstance = documentDictionary;
    this.hasPayloads = hasPayloads;
    this.hasContexts = hasContexts;
    docCount = documentDictionary.m_reader.MaxDoc - 1;
    weightValues = (documentDictionary.weightField != null)
        ? MultiDocValues.GetNumericValues(documentDictionary.m_reader, documentDictionary.weightField)
        : null;
    liveDocs = (documentDictionary.m_reader.Leaves.Count > 0)
        ? MultiFields.GetLiveDocs(documentDictionary.m_reader)
        : null;
    relevantFields = GetRelevantFields(new string[]
    {
        documentDictionary.field, documentDictionary.weightField,
        documentDictionary.m_payloadField, documentDictionary.m_contextsField
    });
}
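For this newer enumerator variant, a hedged consumption sketch, assuming inputEnumerator is a DocumentInputEnumerator obtained from the dictionary (MoveNext/Current follow the Lucene.NET 4.8 IInputEnumerator pattern; treat the member names as assumptions and verify against the version in use):

// Sketch only: drain a DocumentInputEnumerator via the MoveNext/Current pattern.
while (inputEnumerator.MoveNext())
{
    BytesRef term = inputEnumerator.Current;    // suggestion text
    long weight = inputEnumerator.Weight;       // from the weight field, when set
    BytesRef payload = inputEnumerator.Payload; // null when hasPayloads is false
}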
public override int DoLogic()
{
    int res = 0;

    // open reader or use existing one
    IndexSearcher searcher = RunData.GetIndexSearcher();

    IndexReader reader;
    bool closeSearcher;
    if (searcher == null)
    {
        // open our own reader
        Directory dir = RunData.Directory;
        reader = DirectoryReader.Open(dir);
        searcher = new IndexSearcher(reader);
        closeSearcher = true;
    }
    else
    {
        // use existing one; this passes +1 ref to us
        reader = searcher.IndexReader;
        closeSearcher = false;
    }

    // optionally warm and add num docs traversed to count
    if (WithWarm)
    {
        Document doc = null;
        IBits liveDocs = MultiFields.GetLiveDocs(reader);
        for (int m = 0; m < reader.MaxDoc; m++)
        {
            if (null == liveDocs || liveDocs.Get(m))
            {
                doc = reader.Document(m);
                res += (doc == null ? 0 : 1);
            }
        }
    }

    if (WithSearch)
    {
        res++;
        Query q = queryMaker.MakeQuery();
        Sort sort = Sort;
        TopDocs hits = null;
        int numHits = NumHits;
        if (numHits > 0)
        {
            if (WithCollector == false)
            {
                if (sort != null)
                {
                    // TODO: instead of always passing false we
                    // should detect based on the query; if we make
                    // the IndexSearcher search methods that take
                    // Weight public again, we can go back to
                    // pulling the Weight ourselves:
                    TopFieldCollector collector = TopFieldCollector.Create(sort, numHits, true,
                        WithScore, WithMaxScore, false);
                    searcher.Search(q, null, collector);
                    hits = collector.GetTopDocs();
                }
                else
                {
                    hits = searcher.Search(q, numHits);
                }
            }
            else
            {
                ICollector collector = CreateCollector();
                searcher.Search(q, null, collector);
                //hits = collector.topDocs();
            }

            string printHitsField = RunData.Config.Get("print.hits.field", null);
            if (hits != null && printHitsField != null && printHitsField.Length > 0)
            {
                Console.WriteLine("totalHits = " + hits.TotalHits);
                Console.WriteLine("maxDoc() = " + reader.MaxDoc);
                Console.WriteLine("numDocs() = " + reader.NumDocs);
                for (int i = 0; i < hits.ScoreDocs.Length; i++)
                {
                    int docID = hits.ScoreDocs[i].Doc;
                    Document doc = reader.Document(docID);
                    Console.WriteLine(" " + i + ": doc=" + docID + " score=" + hits.ScoreDocs[i].Score
                        + " " + printHitsField + " =" + doc.Get(printHitsField));
                }
            }

            if (WithTraverse)
            {
                ScoreDoc[] scoreDocs = hits.ScoreDocs;
                int traversalSize = Math.Min(scoreDocs.Length, TraversalSize);
                if (traversalSize > 0)
                {
                    bool retrieve = WithRetrieve;
                    int numHighlight = Math.Min(NumToHighlight, scoreDocs.Length);
                    Analyzer analyzer = RunData.Analyzer;
                    BenchmarkHighlighter highlighter = null;
                    if (numHighlight > 0)
                    {
                        highlighter = GetBenchmarkHighlighter(q);
                    }
                    for (int m = 0; m < traversalSize; m++)
                    {
                        int id = scoreDocs[m].Doc;
                        res++;
                        if (retrieve)
                        {
                            Document document = RetrieveDoc(reader, id);
                            res += document != null ? 1 : 0;
                            if (numHighlight > 0 && m < numHighlight)
                            {
                                ICollection<string> fieldsToHighlight = GetFieldsToHighlight(document);
                                foreach (string field in fieldsToHighlight)
                                {
                                    string text = document.Get(field);
                                    res += highlighter.DoHighlight(reader, id, field, document, analyzer, text);
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (closeSearcher)
    {
        reader.Dispose();
    }
    else
    {
        // Release our +1 ref from above
        reader.DecRef();
    }
    return res;
}