public virtual void TestTokenReuse()
{
    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this);

    IndexWriter writer = new IndexWriter(Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));

    Document doc = new Document();
    doc.Add(NewTextField("f1", "a 5 a a", Field.Store.YES));

    writer.AddDocument(doc);
    writer.Commit();
    SegmentCommitInfo info = writer.NewestSegment();
    writer.Dispose();
    SegmentReader reader = new SegmentReader(info, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, NewIOContext(Random()));

    DocsAndPositionsEnum termPositions = MultiFields.GetTermPositionsEnum(reader, reader.LiveDocs, "f1", new BytesRef("a"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    int freq = termPositions.Freq;
    Assert.AreEqual(3, freq);
    Assert.AreEqual(0, termPositions.NextPosition());
    Assert.IsNotNull(termPositions.GetPayload());
    Assert.AreEqual(6, termPositions.NextPosition());
    Assert.IsNull(termPositions.GetPayload());
    Assert.AreEqual(7, termPositions.NextPosition());
    Assert.IsNull(termPositions.GetPayload());
    reader.Dispose();
}
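// The AnalyzerAnonymousInnerClassHelper2 used above is not shown in this excerpt. Judging only
// from the assertions (freq 3 for "a", positions 0/6/7, and a payload on the first position only),
// its filter appears to treat a digit token as a position-increment gap and to attach a payload to
// the first token. The following is a hypothetical sketch consistent with those assertions, not
// the test's actual helper; the class name and payload byte value are made up.
internal sealed class DigitGapPayloadFilter : TokenFilter
{
    private readonly ICharTermAttribute termAtt;
    private readonly IPositionIncrementAttribute posIncrAtt;
    private readonly IPayloadAttribute payloadAtt;
    private bool first = true;

    public DigitGapPayloadFilter(TokenStream input)
        : base(input)
    {
        termAtt = AddAttribute<ICharTermAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        payloadAtt = AddAttribute<IPayloadAttribute>();
    }

    public override bool IncrementToken()
    {
        if (!m_input.IncrementToken())
        {
            return false;
        }
        string term = termAtt.ToString();
        if (term.Length == 1 && char.IsDigit(term[0]))
        {
            // a single-digit token stands for a gap of that many positions ("5" -> +5),
            // which is how "a 5 a a" could yield positions 0, 6, 7 for "a"
            posIncrAtt.PositionIncrement = term[0] - '0';
        }
        // only the very first token of the stream carries a payload
        payloadAtt.Payload = first ? new BytesRef(new byte[] { 100 }) : null;
        first = false;
        return true;
    }

    public override void Reset()
    {
        base.Reset();
        first = true;
    }
}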
public virtual void TestTerms()
{
    Fields fields = MultiFields.GetFields(Reader);
    foreach (string field in fields)
    {
        Terms terms = fields.GetTerms(field);
        Assert.IsNotNull(terms);
        TermsEnum termsEnum = terms.GetIterator(null);
        while (termsEnum.Next() != null)
        {
            BytesRef term = termsEnum.Term;
            Assert.IsTrue(term != null);
            string fieldValue = (string)DocHelper.NameValues[field];
            Assert.IsTrue(fieldValue.IndexOf(term.Utf8ToString(), StringComparison.Ordinal) != -1);
        }
    }

    DocsEnum termDocs = TestUtil.Docs(Random, Reader, DocHelper.TEXT_FIELD_1_KEY, new BytesRef("field"), MultiFields.GetLiveDocs(Reader), null, 0);
    Assert.IsTrue(termDocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    termDocs = TestUtil.Docs(Random, Reader, DocHelper.NO_NORMS_KEY, new BytesRef(DocHelper.NO_NORMS_TEXT), MultiFields.GetLiveDocs(Reader), null, 0);
    Assert.IsTrue(termDocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    DocsAndPositionsEnum positions = MultiFields.GetTermPositionsEnum(Reader, MultiFields.GetLiveDocs(Reader), DocHelper.TEXT_FIELD_1_KEY, new BytesRef("field"));

    // NOTE: prior rev of this test was failing to first
    // call next here:
    Assert.IsTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.IsTrue(positions.DocID == 0);
    Assert.IsTrue(positions.NextPosition() >= 0);
}
public virtual void TestPositionIncrementGap()
{
    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this);

    IndexWriter writer = new IndexWriter(Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));

    Document doc = new Document();
    doc.Add(NewTextField("repeated", "repeated one", Field.Store.YES));
    doc.Add(NewTextField("repeated", "repeated two", Field.Store.YES));

    writer.AddDocument(doc);
    writer.Commit();
    SegmentCommitInfo info = writer.NewestSegment();
    writer.Dispose();
    SegmentReader reader = new SegmentReader(info, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, NewIOContext(Random()));

    DocsAndPositionsEnum termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "repeated", new BytesRef("repeated"));
    Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    int freq = termPositions.Freq;
    Assert.AreEqual(2, freq);
    Assert.AreEqual(0, termPositions.NextPosition());
    Assert.AreEqual(502, termPositions.NextPosition());
    reader.Dispose();
}
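// The AnalyzerAnonymousInnerClassHelper used above is not shown in this excerpt. The assertions
// (positions 0 and 502 for "repeated" across two instances of the same field) imply a position
// increment gap of 500 between field instances: 1 (last position of the first instance) + 500 + 1
// = 502. A minimal analyzer with that behavior might look like the hypothetical sketch below;
// the class name is made up and the real helper may be implemented differently.
internal sealed class GapAnalyzer : Analyzer
{
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        // plain whitespace tokenization is all the test needs
        return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
    }

    public override int GetPositionIncrementGap(string fieldName)
    {
        // gap inserted between multiple values of the same field in one document
        return 500;
    }
}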
public virtual void TestSeek()
{
    Directory directory = NewDirectory();
    IndexWriter writer = new IndexWriter(directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
    for (int i = 0; i < 10; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField(this.Field, "a b", Documents.Field.Store.YES));
        writer.AddDocument(doc);
    }

    writer.Dispose();
    IndexReader reader = DirectoryReader.Open(directory);

    DocsAndPositionsEnum tp = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), this.Field, new BytesRef("b"));

    for (int i = 0; i < 10; i++)
    {
        tp.NextDoc();
        Assert.AreEqual(tp.DocID, i);
        Assert.AreEqual(tp.NextPosition(), 1);
    }

    tp = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), this.Field, new BytesRef("a"));

    for (int i = 0; i < 10; i++)
    {
        tp.NextDoc();
        Assert.AreEqual(tp.DocID, i);
        Assert.AreEqual(tp.NextPosition(), 0);
    }

    reader.Dispose();
    directory.Dispose();
}
public virtual void TestBasic()
{
    Directory dir = NewDirectory();

    RandomIndexWriter w = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
    Field f = NewField("foo", "this is a test test", ft);
    doc.Add(f);
    for (int i = 0; i < 100; i++)
    {
        w.AddDocument(doc);
    }

    IndexReader reader = w.Reader;
    w.Dispose();

    // positions were not indexed, so no positions enum is available for this field
    Assert.IsNull(MultiFields.GetTermPositionsEnum(reader, null, "foo", new BytesRef("test")));

    DocsEnum de = TestUtil.Docs(Random(), reader, "foo", new BytesRef("test"), null, null, DocsFlags.FREQS);
    while (de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        Assert.AreEqual(2, de.Freq);
    }

    reader.Dispose();
    dir.Dispose();
}
public virtual void TestBasic()
{
    Directory dir = NewDirectory();

    RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    if (Random.NextBoolean())
    {
        ft.StoreTermVectors = true;
        ft.StoreTermVectorPositions = Random.NextBoolean();
        ft.StoreTermVectorOffsets = Random.NextBoolean();
    }

    Token[] tokens = new Token[]
    {
        MakeToken("a", 1, 0, 6),
        MakeToken("b", 1, 8, 9),
        MakeToken("a", 1, 9, 17),
        MakeToken("c", 1, 19, 50)
    };
    doc.Add(new Field("content", new CannedTokenStream(tokens), ft));
    w.AddDocument(doc);
    IndexReader r = w.GetReader();
    w.Dispose();

    DocsAndPositionsEnum dp = MultiFields.GetTermPositionsEnum(r, null, "content", new BytesRef("a"));
    Assert.IsNotNull(dp);
    Assert.AreEqual(0, dp.NextDoc());
    Assert.AreEqual(2, dp.Freq);
    Assert.AreEqual(0, dp.NextPosition());
    Assert.AreEqual(0, dp.StartOffset);
    Assert.AreEqual(6, dp.EndOffset);
    Assert.AreEqual(2, dp.NextPosition());
    Assert.AreEqual(9, dp.StartOffset);
    Assert.AreEqual(17, dp.EndOffset);
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dp.NextDoc());

    dp = MultiFields.GetTermPositionsEnum(r, null, "content", new BytesRef("b"));
    Assert.IsNotNull(dp);
    Assert.AreEqual(0, dp.NextDoc());
    Assert.AreEqual(1, dp.Freq);
    Assert.AreEqual(1, dp.NextPosition());
    Assert.AreEqual(8, dp.StartOffset);
    Assert.AreEqual(9, dp.EndOffset);
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dp.NextDoc());

    dp = MultiFields.GetTermPositionsEnum(r, null, "content", new BytesRef("c"));
    Assert.IsNotNull(dp);
    Assert.AreEqual(0, dp.NextDoc());
    Assert.AreEqual(1, dp.Freq);
    Assert.AreEqual(3, dp.NextPosition());
    Assert.AreEqual(19, dp.StartOffset);
    Assert.AreEqual(50, dp.EndOffset);
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dp.NextDoc());

    r.Dispose();
    dir.Dispose();
}
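// The MakeToken helper called above is not included in this excerpt. A plausible implementation
// (hypothetical sketch under the assumption that only term text, position increment, and offsets
// matter for the CannedTokenStream) is:
private static Token MakeToken(string text, int posIncr, int startOffset, int endOffset)
{
    var t = new Token();
    t.Append(text);
    t.PositionIncrement = posIncr;
    t.SetOffset(startOffset, endOffset);
    return t;
}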
public virtual void DoTestNumbers(bool withPayloads)
{
    Directory dir = NewDirectory();
    Analyzer analyzer = withPayloads ? (Analyzer)new MockPayloadAnalyzer() : new MockAnalyzer(Random);
    iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    iwc.SetMergePolicy(NewLogMergePolicy()); // will rely on docids a bit for skipping
    RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc);

    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    if (Random.NextBoolean())
    {
        ft.StoreTermVectors = true;
        ft.StoreTermVectorOffsets = Random.NextBoolean();
        ft.StoreTermVectorPositions = Random.NextBoolean();
    }

    int numDocs = AtLeast(500);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        doc.Add(new Field("numbers", English.Int32ToEnglish(i), ft));
        doc.Add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
        doc.Add(new StringField("id", "" + i, Field.Store.NO));
        w.AddDocument(doc);
    }

    IndexReader reader = w.GetReader();
    w.Dispose();

    string[] terms = new string[] { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };

    foreach (string term in terms)
    {
        DocsAndPositionsEnum dp = MultiFields.GetTermPositionsEnum(reader, null, "numbers", new BytesRef(term));
        int doc;
        while ((doc = dp.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
        {
            string storedNumbers = reader.Document(doc).Get("numbers");
            int freq = dp.Freq;
            for (int i = 0; i < freq; i++)
            {
                dp.NextPosition();
                int start = dp.StartOffset;
                if (Debugging.AssertsEnabled) Debugging.Assert(start >= 0);
                int end = dp.EndOffset;
                if (Debugging.AssertsEnabled) Debugging.Assert(end >= 0 && end >= start);
                // check that the offsets correspond to the term in the src text
                Assert.IsTrue(storedNumbers.Substring(start, end - start).Equals(term, StringComparison.Ordinal));
                if (withPayloads)
                {
                    // check that we have a payload and it starts with "pos"
                    Assert.IsNotNull(dp.GetPayload());
                    BytesRef payload = dp.GetPayload();
                    Assert.IsTrue(payload.Utf8ToString().StartsWith("pos:", StringComparison.Ordinal));
                }
                // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
            }
        }
    }

    // check we can skip correctly
    int numSkippingTests = AtLeast(50);

    for (int j = 0; j < numSkippingTests; j++)
    {
        int num = TestUtil.NextInt32(Random, 100, Math.Min(numDocs - 1, 999));
        DocsAndPositionsEnum dp = MultiFields.GetTermPositionsEnum(reader, null, "numbers", new BytesRef("hundred"));
        int doc = dp.Advance(num);
        Assert.AreEqual(num, doc);
        int freq = dp.Freq;
        for (int i = 0; i < freq; i++)
        {
            string storedNumbers = reader.Document(doc).Get("numbers");
            dp.NextPosition();
            int start = dp.StartOffset;
            if (Debugging.AssertsEnabled) Debugging.Assert(start >= 0);
            int end = dp.EndOffset;
            if (Debugging.AssertsEnabled) Debugging.Assert(end >= 0 && end >= start);
            // check that the offsets correspond to the term in the src text
            Assert.IsTrue(storedNumbers.Substring(start, end - start).Equals("hundred", StringComparison.Ordinal));
            if (withPayloads)
            {
                // check that we have a payload and it starts with "pos"
                Assert.IsNotNull(dp.GetPayload());
                BytesRef payload = dp.GetPayload();
                Assert.IsTrue(payload.Utf8ToString().StartsWith("pos:", StringComparison.Ordinal));
            }
            // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
        }
    }

    // check that other fields (without offsets) work correctly
    for (int i = 0; i < numDocs; i++)
    {
        DocsEnum dp = MultiFields.GetTermDocsEnum(reader, null, "id", new BytesRef("" + i), 0);
        Assert.AreEqual(i, dp.NextDoc());
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dp.NextDoc());
    }

    reader.Dispose();
    dir.Dispose();
}
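// The "pos:" prefix checked above is all the test relies on, but assuming the MockPayloadAnalyzer
// payload bytes are UTF-8 text of the form "pos: <position>", the encoded position could also be
// decoded and cross-checked against NextPosition(). This is a hypothetical helper for illustration,
// not part of the test framework:
private static int ParsePositionFromPayload(BytesRef payload)
{
    string text = payload.Utf8ToString(); // e.g. "pos: 3" (assumed format)
    return int.Parse(text.Substring("pos:".Length).Trim(), System.Globalization.CultureInfo.InvariantCulture);
}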
public virtual void TestLongPostings_Mem()
{
    // Don't use TestUtil.getTempDir so that we own the
    // randomness (ie same seed will point to same dir):
    Directory dir = NewFSDirectory(CreateTempDir("longpostings" + "." + Random.NextInt64()));

    int NUM_DOCS = AtLeast(2000);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS);
    }

    string s1 = GetRandomTerm(null);
    string s2 = GetRandomTerm(s1);

    if (VERBOSE)
    {
        Console.WriteLine("\nTEST: s1=" + s1 + " s2=" + s2);
        /*
         * for(int idx=0;idx<s1.Length();idx++) {
         *   System.out.println("  s1 ch=0x" + Integer.toHexString(s1.charAt(idx)));
         * }
         * for(int idx=0;idx<s2.Length();idx++) {
         *   System.out.println("  s2 ch=0x" + Integer.toHexString(s2.charAt(idx)));
         * }
         */
    }

    FixedBitSet isS1 = new FixedBitSet(NUM_DOCS);
    for (int idx = 0; idx < NUM_DOCS; idx++)
    {
        if (Random.NextBoolean())
        {
            isS1.Set(idx);
        }
    }

    IndexReader r;
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetOpenMode(OpenMode.CREATE).SetMergePolicy(NewLogMergePolicy());
    iwc.SetRAMBufferSizeMB(16.0 + 16.0 * Random.NextDouble());
    iwc.SetMaxBufferedDocs(-1);
    RandomIndexWriter riw = new RandomIndexWriter(Random, dir, iwc);

    for (int idx = 0; idx < NUM_DOCS; idx++)
    {
        Document doc = new Document();
        string s = isS1.Get(idx) ? s1 : s2;
        Field f = NewTextField("field", s, Field.Store.NO);
        int count = TestUtil.NextInt32(Random, 1, 4);
        for (int ct = 0; ct < count; ct++)
        {
            doc.Add(f);
        }
        riw.AddDocument(doc);
    }

    r = riw.GetReader();
    riw.Dispose();

    /*
     * if (VERBOSE) {
     *   System.out.println("TEST: terms");
     *   TermEnum termEnum = r.Terms();
     *   while(termEnum.Next()) {
     *     System.out.println("  term=" + termEnum.Term() + " len=" + termEnum.Term().Text().Length());
     *     Assert.IsTrue(termEnum.DocFreq() > 0);
     *     System.out.println("    s1?=" + (termEnum.Term().Text().equals(s1)) + " s1len=" + s1.Length());
     *     System.out.println("    s2?=" + (termEnum.Term().Text().equals(s2)) + " s2len=" + s2.Length());
     *     final String s = termEnum.Term().Text();
     *     for(int idx=0;idx<s.Length();idx++) {
     *       System.out.println("      ch=0x" + Integer.toHexString(s.charAt(idx)));
     *     }
     *   }
     * }
     */

    Assert.AreEqual(NUM_DOCS, r.NumDocs);
    Assert.IsTrue(r.DocFreq(new Term("field", s1)) > 0);
    Assert.IsTrue(r.DocFreq(new Term("field", s2)) > 0);

    int num = AtLeast(1000);
    for (int iter = 0; iter < num; iter++)
    {
        string term;
        bool doS1;
        if (Random.NextBoolean())
        {
            term = s1;
            doS1 = true;
        }
        else
        {
            term = s2;
            doS1 = false;
        }

        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " doS1=" + doS1);
        }

        DocsAndPositionsEnum postings = MultiFields.GetTermPositionsEnum(r, null, "field", new BytesRef(term));

        int docID = -1;
        while (docID < DocIdSetIterator.NO_MORE_DOCS)
        {
            int what = Random.Next(3);
            if (what == 0)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: docID=" + docID + "; do next()");
                }
                // nextDoc
                int expected = docID + 1;
                while (true)
                {
                    if (expected == NUM_DOCS)
                    {
                        expected = int.MaxValue;
                        break;
                    }
                    else if (isS1.Get(expected) == doS1)
                    {
                        break;
                    }
                    else
                    {
                        expected++;
                    }
                }
                docID = postings.NextDoc();
                if (VERBOSE)
                {
                    Console.WriteLine("  got docID=" + docID);
                }
                Assert.AreEqual(expected, docID);
                if (docID == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }

                if (Random.Next(6) == 3)
                {
                    int freq = postings.Freq;
                    Assert.IsTrue(freq >= 1 && freq <= 4);
                    for (int pos = 0; pos < freq; pos++)
                    {
                        Assert.AreEqual(pos, postings.NextPosition());
                        if (Random.NextBoolean())
                        {
                            var dummy = postings.GetPayload();
                            if (Random.NextBoolean())
                            {
                                dummy = postings.GetPayload(); // get it again
                            }
                        }
                    }
                }
            }
            else
            {
                // advance
                int targetDocID;
                if (docID == -1)
                {
                    targetDocID = Random.Next(NUM_DOCS + 1);
                }
                else
                {
                    targetDocID = docID + TestUtil.NextInt32(Random, 1, NUM_DOCS - docID);
                }
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: docID=" + docID + "; do advance(" + targetDocID + ")");
                }
                int expected = targetDocID;
                while (true)
                {
                    if (expected == NUM_DOCS)
                    {
                        expected = int.MaxValue;
                        break;
                    }
                    else if (isS1.Get(expected) == doS1)
                    {
                        break;
                    }
                    else
                    {
                        expected++;
                    }
                }

                docID = postings.Advance(targetDocID);
                if (VERBOSE)
                {
                    Console.WriteLine("  got docID=" + docID);
                }
                Assert.AreEqual(expected, docID);
                if (docID == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }

                if (Random.Next(6) == 3)
                {
                    int freq = postings.Freq;
                    Assert.IsTrue(freq >= 1 && freq <= 4);
                    for (int pos = 0; pos < freq; pos++)
                    {
                        Assert.AreEqual(pos, postings.NextPosition());
                        if (Random.NextBoolean())
                        {
                            var dummy = postings.GetPayload();
                            if (Random.NextBoolean())
                            {
                                dummy = postings.GetPayload(); // get it again
                            }
                        }
                    }
                }
            }
        }
    }

    r.Dispose();
    dir.Dispose();
}
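// GetRandomTerm, used above to pick the two indexed terms, is not included in this excerpt. The
// real helper in TestLongPostings is more careful (it rejects terms that do not analyze cleanly,
// e.g. because of broken surrogate pairs); the hypothetical sketch below only conveys the general
// idea: return a random realistic term that differs from the other term.
private string GetRandomTermSketch(string other)
{
    while (true)
    {
        string term = TestUtil.RandomRealisticUnicodeString(Random);
        if (term.Length > 0 && !term.Equals(other, StringComparison.Ordinal))
        {
            return term;
        }
    }
}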
// builds an index with payloads in the given Directory and performs
// different tests to verify the payload encoding
private void PerformTest(Directory dir)
{
    PayloadAnalyzer analyzer = new PayloadAnalyzer();
    IndexWriter writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetOpenMode(OpenMode.CREATE).SetMergePolicy(NewLogMergePolicy()));

    // should be in sync with value in TermInfosWriter
    const int skipInterval = 16;

    const int numTerms = 5;
    const string fieldName = "f1";

    int numDocs = skipInterval + 1;
    // create content for the test documents with just a few terms
    Term[] terms = GenerateTerms(fieldName, numTerms);
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < terms.Length; i++)
    {
        sb.Append(terms[i].Text());
        sb.Append(" ");
    }
    string content = sb.ToString();

    int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
    var payloadData = GenerateRandomData(payloadDataLength);

    Document d = new Document();
    d.Add(NewTextField(fieldName, content, Field.Store.NO));

    // add the same document multiple times to have the same payload lengths for all
    // occurrences within two consecutive skip intervals
    int offset = 0;
    for (int i = 0; i < 2 * numDocs; i++)
    {
        analyzer = new PayloadAnalyzer(fieldName, payloadData, offset, 1);
        offset += numTerms;
        writer.AddDocument(d, analyzer);
    }

    // make sure we create more than one segment to test merging
    writer.Commit();

    // now we make sure to have different payload lengths at the next skip point
    for (int i = 0; i < numDocs; i++)
    {
        analyzer = new PayloadAnalyzer(fieldName, payloadData, offset, i);
        offset += i * numTerms;
        writer.AddDocument(d, analyzer);
    }

    writer.ForceMerge(1); // flush
    writer.Dispose();

    /*
     * Verify the index
     * first we test if all payloads are stored correctly
     */
    IndexReader reader = DirectoryReader.Open(dir);

    var verifyPayloadData = new byte[payloadDataLength];
    offset = 0;
    var tps = new DocsAndPositionsEnum[numTerms];
    for (int i = 0; i < numTerms; i++)
    {
        tps[i] = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), terms[i].Field, new BytesRef(terms[i].Text()));
    }

    while (tps[0].NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        for (int i = 1; i < numTerms; i++)
        {
            tps[i].NextDoc();
        }
        int freq = tps[0].Freq;

        for (int i = 0; i < freq; i++)
        {
            for (int j = 0; j < numTerms; j++)
            {
                tps[j].NextPosition();
                BytesRef br = tps[j].GetPayload();
                if (br != null)
                {
                    Array.Copy(br.Bytes, br.Offset, verifyPayloadData, offset, br.Length);
                    offset += br.Length;
                }
            }
        }
    }

    AssertByteArrayEquals(payloadData, verifyPayloadData);

    /*
     * test lazy skipping
     */
    DocsAndPositionsEnum tp = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), terms[0].Field, new BytesRef(terms[0].Text()));
    tp.NextDoc();
    tp.NextPosition();
    // NOTE: prior rev of this test was failing to first
    // call next here:
    tp.NextDoc();
    // now we don't read this payload
    tp.NextPosition();
    BytesRef payload = tp.GetPayload();
    Assert.AreEqual(1, payload.Length, "Wrong payload length.");
    Assert.AreEqual(payload.Bytes[payload.Offset], payloadData[numTerms]);
    tp.NextDoc();
    tp.NextPosition();

    // we don't read this payload and skip to a different document
    tp.Advance(5);
    tp.NextPosition();
    payload = tp.GetPayload();
    Assert.AreEqual(1, payload.Length, "Wrong payload length.");
    Assert.AreEqual(payload.Bytes[payload.Offset], payloadData[5 * numTerms]);

    /*
     * Test different lengths at skip points
     */
    tp = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), terms[1].Field, new BytesRef(terms[1].Text()));
    tp.NextDoc();
    tp.NextPosition();
    Assert.AreEqual(1, tp.GetPayload().Length, "Wrong payload length.");
    tp.Advance(skipInterval - 1);
    tp.NextPosition();
    Assert.AreEqual(1, tp.GetPayload().Length, "Wrong payload length.");
    tp.Advance(2 * skipInterval - 1);
    tp.NextPosition();
    Assert.AreEqual(1, tp.GetPayload().Length, "Wrong payload length.");
    tp.Advance(3 * skipInterval - 1);
    tp.NextPosition();
    Assert.AreEqual(3 * skipInterval - 2 * numDocs - 1, tp.GetPayload().Length, "Wrong payload length.");

    reader.Dispose();

    // test long payload
    analyzer = new PayloadAnalyzer();
    writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetOpenMode(OpenMode.CREATE));
    string singleTerm = "lucene";

    d = new Document();
    d.Add(NewTextField(fieldName, singleTerm, Field.Store.NO));
    // add a payload whose length is greater than the buffer size of BufferedIndexOutput
    payloadData = GenerateRandomData(2000);
    analyzer.SetPayloadData(fieldName, payloadData, 100, 1500);
    writer.AddDocument(d);

    writer.ForceMerge(1); // flush
    writer.Dispose();

    reader = DirectoryReader.Open(dir);
    tp = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), fieldName, new BytesRef(singleTerm));
    tp.NextDoc();
    tp.NextPosition();

    BytesRef bref = tp.GetPayload();
    verifyPayloadData = new byte[bref.Length];
    var portion = new byte[1500];
    Array.Copy(payloadData, 100, portion, 0, 1500);

    AssertByteArrayEquals(portion, bref.Bytes, bref.Offset, bref.Length);
    reader.Dispose();
}
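// GenerateRandomData, used above to build the payload buffer, is not included in this excerpt.
// All the test needs is a buffer of the requested size filled with reproducible random bytes, so
// a minimal sketch (hypothetical, assuming the test's Random property is the only source of
// randomness) could be:
private byte[] GenerateRandomDataSketch(int n)
{
    var data = new byte[n];
    Random.NextBytes(data); // deterministic per test seed
    return data;
}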