/// <summary>
/// Builds the test document for <paramref name="id"/>. The document carries one field per
/// index feature exercised by the tests: a stored id field, a docs-enum term field, a
/// positions field fed from <paramref name="positions"/>, numeric/binary/sorted (and, when
/// the default codec supports it, sorted-set) doc values, a norms field and a term-vectors field.
/// </summary>
/// <param name="id">Document id; also used (as text or number) for every field value.</param>
/// <param name="positions">Token stream for the positions field; its id is set to <paramref name="id"/> before indexing.</param>
/// <returns>The populated <see cref="Document"/>.</returns>
private static Document Doc(int id, PositionsTokenStream positions)
{
    // Format the id once, culture-invariantly: the text ends up in the index and must not
    // depend on the culture of the machine (or the randomized test culture) running the test.
    string idText = id.ToString(CultureInfo.InvariantCulture);

    Document doc = new Document();
    doc.Add(new StringField(ID_FIELD, idText, Field.Store.YES));
    doc.Add(new StringField(DOCS_ENUM_FIELD, DOCS_ENUM_TERM, Field.Store.NO));

    positions.SetId(id);
    if (DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
    {
        // codec doesn't support offsets: just index positions for the field
        doc.Add(new Field(DOC_POSITIONS_FIELD, positions, TextField.TYPE_NOT_STORED));
    }
    else
    {
        doc.Add(new Field(DOC_POSITIONS_FIELD, positions, POSITIONS_TYPE));
    }

    doc.Add(new NumericDocValuesField(NUMERIC_DV_FIELD, id));

    // The boost is the float whose raw bit pattern equals the id.
    TextField norms = new TextField(NORMS_FIELD, idText, Field.Store.NO);
    norms.Boost = Number.Int32BitsToSingle(id);
    doc.Add(norms);

    doc.Add(new BinaryDocValuesField(BINARY_DV_FIELD, new BytesRef(idText)));
    doc.Add(new SortedDocValuesField(SORTED_DV_FIELD, new BytesRef(idText)));
    if (DefaultCodecSupportsSortedSet)
    {
        // Two values per document: id and id + 1.
        doc.Add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(idText)));
        doc.Add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef((id + 1).ToString(CultureInfo.InvariantCulture))));
    }

    doc.Add(new Field(TERM_VECTORS_FIELD, idText, TERM_VECTORS_TYPE));
    return doc;
}
/// <summary>
/// Verifies the postings of <c>DOC_POSITIONS_TERM</c> in <c>DOC_POSITIONS_FIELD</c> twice:
/// once stepping with <c>NextDoc()</c> and once skipping with <c>Advance()</c> by a random
/// stride of 1..5 on an enum obtained through the reuse path. For every visited document the
/// frequency, positions, offsets (when the codec supports them) and payloads are checked.
/// </summary>
public virtual void TestDocsAndPositionsEnum()
{
    TermsEnum termsEnum = reader.GetTerms(DOC_POSITIONS_FIELD).GetIterator(null);
    assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(DOC_POSITIONS_TERM)));

    // Looked up once instead of once per position; the answer is assumed to be stable
    // for the duration of the test because it is keyed only on the field name.
    bool offsetsSupported = !DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD));

    DocsAndPositionsEnum sortedPositions = termsEnum.DocsAndPositions(null, null);
    int doc;

    // test nextDoc()
    while ((doc = sortedPositions.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
    {
        AssertPositionsOfCurrentDoc(sortedPositions, doc, offsetsSupported);
    }

    // test advance()
    DocsAndPositionsEnum reuse = sortedPositions;
    sortedPositions = termsEnum.DocsAndPositions(null, reuse);
    if (sortedPositions is SortingAtomicReader.SortingDocsAndPositionsEnum)
    {
        // make sure reuse worked
        assertTrue(((SortingAtomicReader.SortingDocsAndPositionsEnum)sortedPositions).Reused(reuse));
    }
    doc = 0;
    while ((doc = sortedPositions.Advance(doc + TestUtil.NextInt32(Random, 1, 5))) != DocIdSetIterator.NO_MORE_DOCS)
    {
        AssertPositionsOfCurrentDoc(sortedPositions, doc, offsetsSupported);
    }
}

/// <summary>
/// Checks the document the enum is currently positioned on: its frequency must be
/// <c>sortedValues[doc] / 10 + 1</c>, position <c>i</c> must be <c>i</c>, start and end
/// offsets must both be <c>i</c> (only when <paramref name="offsetsSupported"/>), and the
/// payload must parse to <c>freq - i</c>.
/// </summary>
/// <param name="positions">Enum already positioned on <paramref name="doc"/>.</param>
/// <param name="doc">Current document id, used for the lookup in <c>sortedValues</c> and in failure messages.</param>
/// <param name="offsetsSupported">Whether start/end offsets should be asserted.</param>
private void AssertPositionsOfCurrentDoc(DocsAndPositionsEnum positions, int doc, bool offsetsSupported)
{
    int freq = positions.Freq;
    assertEquals("incorrect freq for doc=" + doc, sortedValues[doc] / 10 + 1, freq);
    for (int i = 0; i < freq; i++)
    {
        assertEquals("incorrect position for doc=" + doc, i, positions.NextPosition());
        if (offsetsSupported)
        {
            assertEquals("incorrect startOffset for doc=" + doc, i, positions.StartOffset);
            assertEquals("incorrect endOffset for doc=" + doc, i, positions.EndOffset);
        }
        assertEquals("incorrect payload for doc=" + doc, freq - i, int.Parse(positions.GetPayload().Utf8ToString(), CultureInfo.InvariantCulture));
    }
}
/// <summary>
/// Builds a shuffled queue in which the term for number <c>i</c> occurs exactly <c>i</c> times
/// (for <c>i</c> in <c>[0, numTerms)</c>), hands it to 1..5 worker threads together with a shared
/// writer, force-merges to a single segment and then asserts that every term's
/// <c>TotalTermFreq</c> equals the number the term spells.
/// The worker implementation lives outside this method; presumably it drains the queue into
/// documents of at most <c>maxTermsPerDoc</c> terms.
/// </summary>
public virtual void Test()
{
    // Terms are numbers rendered as text and parsed back below; use one explicit culture for
    // both directions. Fully qualified so no extra using directive is required.
    IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

    IList<string> postingsList = new List<string>();
    int numTerms = AtLeast(300);
    int maxTermsPerDoc = TestUtil.NextInt(Random(), 10, 20);

    bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"), StringComparison.Ordinal);

    IndexWriterConfig iwc = NewIndexWriterConfig(Random(), TEST_VERSION_CURRENT, new MockAnalyzer(Random()));

    if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TEST_NIGHTLY || RANDOM_MULTIPLIER > 1))
    {
        // Otherwise test can take way too long (> 2 hours)
        numTerms /= 2;
    }

    if (VERBOSE)
    {
        Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc);
        Console.WriteLine("numTerms=" + numTerms);
    }

    // Term i is queued i times, so term "0" never occurs.
    for (int i = 0; i < numTerms; i++)
    {
        string term = Convert.ToString(i, invariant);
        for (int j = 0; j < i; j++)
        {
            postingsList.Add(term);
        }
    }
    postingsList = CollectionsHelper.Shuffle(postingsList);

    ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList);

    // using blocks guarantee the reader, writer and directory are released (in that order)
    // even when an assertion fails part-way through.
    using (Directory dir = NewFSDirectory(CreateTempDir(GetFullMethodName())))
    using (RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, iwc))
    {
        int threadCount = TestUtil.NextInt(Random(), 1, 5);
        if (VERBOSE)
        {
            Console.WriteLine("config: " + iw.w.Config);
            Console.WriteLine("threadCount=" + threadCount);
        }

        Field prototype = NewTextField("field", "", Field.Store.NO);
        FieldType fieldType = new FieldType((FieldType)prototype.FieldType);
        if (Random().NextBoolean())
        {
            fieldType.OmitNorms = true;
        }
        int options = Random().Next(3);
        if (options == 0)
        {
            fieldType.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS; // we dont actually need positions
            fieldType.StoreTermVectors = true; // but enforce term vectors when we do this so we check SOMETHING
        }
        else if (options == 1 && !DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat("field")))
        {
            fieldType.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
        }
        // else just positions

        ThreadClass[] threads = new ThreadClass[threadCount];
        using (CountdownEvent startingGun = new CountdownEvent(1))
        {
            for (int threadID = 0; threadID < threadCount; threadID++)
            {
                Random threadRandom = new Random(Random().Next());
                Document document = new Document();
                Field field = new Field("field", "", fieldType);
                document.Add(field);
                threads[threadID] = new ThreadAnonymousInnerClassHelper(this, numTerms, maxTermsPerDoc, postings, iw, startingGun, threadRandom, document, field);
                threads[threadID].Start();
            }
            startingGun.Signal();
            foreach (ThreadClass t in threads)
            {
                t.Join();
            }
        }

        iw.ForceMerge(1);
        using (DirectoryReader ir = iw.Reader)
        {
            Assert.AreEqual(1, ir.Leaves.Count);
            AtomicReader air = (AtomicReader)ir.Leaves[0].Reader;
            Terms terms = air.Terms("field");
            // numTerms-1 because there cannot be a term 0 with 0 postings:
            Assert.AreEqual(numTerms - 1, terms.Size());
            TermsEnum termsEnum = terms.Iterator(null);
            BytesRef termBR;
            while ((termBR = termsEnum.Next()) != null)
            {
                int value = Convert.ToInt32(termBR.Utf8ToString(), invariant);
                Assert.AreEqual(value, termsEnum.TotalTermFreq());
                // don't really need to check more than this, as CheckIndex
                // will verify that totalTermFreq == total number of positions seen
                // from a docsAndPositionsEnum.
            }
        }
    }
}
/// <summary>
/// Builds a shuffled queue in which the term for number <c>i</c> occurs exactly <c>i</c> times
/// (for <c>i</c> in <c>[0, numTerms)</c>), hands it to 1..5 worker threads together with a shared
/// writer, force-merges to a single segment and then asserts that every term's
/// <c>TotalTermFreq</c> equals the number the term spells.
/// The worker implementation lives outside this method; presumably it drains the queue into
/// documents of at most <c>maxTermsPerDoc</c> terms.
/// </summary>
public virtual void Test()
{
    IList<string> postingsList = new JCG.List<string>();
    int numTerms = AtLeast(300);
    int maxTermsPerDoc = TestUtil.NextInt32(Random, 10, 20);

    bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"), StringComparison.Ordinal);

    IndexWriterConfig iwc = NewIndexWriterConfig(Random, TEST_VERSION_CURRENT, new MockAnalyzer(Random));

    if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TestNightly || RandomMultiplier > 1))
    {
        // Otherwise test can take way too long (> 2 hours).
        // LUCENENET specific - To keep this under the 1 hour free limit
        // of Azure DevOps, this was reduced from /2 to /6.
        numTerms /= 6;
    }

    if (Verbose)
    {
        Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc);
        Console.WriteLine("numTerms=" + numTerms);
    }

    // Term i is queued i times, so term "0" never occurs.
    for (int i = 0; i < numTerms; i++)
    {
        string term = Convert.ToString(i, CultureInfo.InvariantCulture);
        for (int j = 0; j < i; j++)
        {
            postingsList.Add(term);
        }
    }
    postingsList.Shuffle(Random);

    ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList);

    // using blocks guarantee the reader, writer and directory are released (in that order)
    // even when an assertion fails part-way through.
    using (Directory dir = NewFSDirectory(CreateTempDir(GetFullMethodName())))
    using (RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc))
    {
        int threadCount = TestUtil.NextInt32(Random, 1, 5);
        if (Verbose)
        {
            Console.WriteLine("config: " + iw.IndexWriter.Config);
            Console.WriteLine("threadCount=" + threadCount);
        }

        Field prototype = NewTextField("field", "", Field.Store.NO);
        FieldType fieldType = new FieldType(prototype.FieldType);
        if (Random.NextBoolean())
        {
            fieldType.OmitNorms = true;
        }
        int options = Random.Next(3);
        if (options == 0)
        {
            fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS; // we dont actually need positions
            fieldType.StoreTermVectors = true; // but enforce term vectors when we do this so we check SOMETHING
        }
        else if (options == 1 && !DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat("field")))
        {
            fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
        }
        // else just positions

        ThreadJob[] threads = new ThreadJob[threadCount];
        using (CountdownEvent startingGun = new CountdownEvent(1))
        {
            for (int threadID = 0; threadID < threadCount; threadID++)
            {
                Random threadRandom = new Random(Random.Next());
                Document document = new Document();
                Field field = new Field("field", "", fieldType);
                document.Add(field);
                threads[threadID] = new ThreadAnonymousClass(this, numTerms, maxTermsPerDoc, postings, iw, startingGun, threadRandom, document, field);
                threads[threadID].Start();
            }
            startingGun.Signal();
            foreach (ThreadJob t in threads)
            {
                t.Join();
            }
        }

        iw.ForceMerge(1);
        using (DirectoryReader ir = iw.GetReader())
        {
            Assert.AreEqual(1, ir.Leaves.Count);
            AtomicReader air = (AtomicReader)ir.Leaves[0].Reader;
            Terms terms = air.GetTerms("field");
            // numTerms-1 because there cannot be a term 0 with 0 postings:
            Assert.AreEqual(numTerms - 1, terms.Count);
            TermsEnum termsEnum = terms.GetEnumerator();
            while (termsEnum.MoveNext())
            {
                int value = Convert.ToInt32(termsEnum.Term.Utf8ToString(), CultureInfo.InvariantCulture);
                Assert.AreEqual(value, termsEnum.TotalTermFreq);
                // don't really need to check more than this, as CheckIndex
                // will verify that totalTermFreq == total number of positions seen
                // from a docsAndPositionsEnum.
            }
        }
    }
}
/// <summary>
/// Runs <paramref name="iterations"/> rounds of analysis-consistency checks against
/// <paramref name="a"/>. Roughly one round in ten uses a line from the line-file docs
/// (cut down to at most <paramref name="maxWordLength"/> chars without splitting a surrogate
/// pair); the others use a synthetic random string. When <paramref name="iw"/> is not null,
/// a document with a randomly configured "dummy" field is also indexed after each round,
/// occasionally accumulating several values of the field first.
/// </summary>
/// <param name="random">Source of all random decisions; the call order is significant for reproducibility.</param>
/// <param name="a">Analyzer under test.</param>
/// <param name="iterations">Number of texts to check.</param>
/// <param name="maxWordLength">Maximum length of each text.</param>
/// <param name="useCharFilter">Forwarded to the consistency check and printed on failure.</param>
/// <param name="simple">Forwarded to the synthetic string generator.</param>
/// <param name="offsetsAreCorrect">Whether offsets may be indexed and should be verified.</param>
/// <param name="iw">Optional writer; null disables indexing.</param>
private static void CheckRandomData(Random random, Analyzer a, int iterations, int maxWordLength, bool useCharFilter, bool simple, bool offsetsAreCorrect, RandomIndexWriter iw)
{
    LineFileDocs lineDocs = new LineFileDocs(random);
    Document indexedDoc = null;
    Field baseField = null;
    Field activeField = null;
    StringReader emptyReader = new StringReader("");

    if (iw != null)
    {
        indexedDoc = new Document();
        FieldType dummyType = new FieldType(TextField.TYPE_NOT_STORED);

        // Randomly enable term vectors and their sub-options.
        if (random.NextBoolean())
        {
            dummyType.StoreTermVectors = true;
            dummyType.StoreTermVectorOffsets = random.NextBoolean();
            dummyType.StoreTermVectorPositions = random.NextBoolean();
            if (dummyType.StoreTermVectorPositions && !OLD_FORMAT_IMPERSONATION_IS_ACTIVE)
            {
                dummyType.StoreTermVectorPayloads = random.NextBoolean();
            }
        }

        if (random.NextBoolean())
        {
            dummyType.OmitNorms = true;
        }

        string postingsFormat = TestUtil.GetPostingsFormat("dummy");
        bool mayIndexOffsets = !DoesntSupportOffsets.Contains(postingsFormat) && offsetsAreCorrect;

        // Pick one of four index-option levels; the richest one falls back to plain
        // positions when offsets cannot be indexed.
        int level = random.Next(4);
        if (level == 0)
        {
            dummyType.IndexOptions = FieldInfo.IndexOptions.DOCS_ONLY;
        }
        else if (level == 1)
        {
            dummyType.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS;
        }
        else if (level == 2 || !mayIndexOffsets)
        {
            dummyType.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
        }
        else
        {
            dummyType.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
        }

        baseField = new Field("dummy", emptyReader, dummyType);
        activeField = baseField;
        indexedDoc.Add(activeField);
    }

    try
    {
        for (int round = 0; round < iterations; round++)
        {
            string text;
            if (random.Next(10) != 7)
            {
                // synthetic
                text = TestUtil.RandomAnalysisString(random, maxWordLength, simple);
            }
            else
            {
                // real data from linedocs
                text = lineDocs.NextDoc().Get("body");
                if (text.Length > maxWordLength)
                {
                    // Take a random slice from the text...:
                    int sliceStart = random.Next(text.Length - maxWordLength);
                    if (sliceStart > 0 && char.IsLowSurrogate(text[sliceStart]))
                    {
                        // Take care not to split up a surrogate pair:
                        sliceStart--;
                        Assert.True(char.IsHighSurrogate(text[sliceStart]));
                    }
                    int sliceEnd = sliceStart + maxWordLength - 1;
                    if (char.IsHighSurrogate(text[sliceEnd]))
                    {
                        // Take care not to split up a surrogate pair:
                        sliceEnd--;
                    }
                    text = text.Substring(sliceStart, 1 + sliceEnd - sliceStart);
                }
            }

            try
            {
                CheckAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, activeField);

                if (iw == null)
                {
                    continue;
                }

                if (random.Next(7) == 0)
                {
                    // pile up a multivalued field
                    var sameType = (FieldType)baseField.FieldType();
                    activeField = new Field("dummy", emptyReader, sameType);
                    indexedDoc.Add(activeField);
                }
                else
                {
                    iw.AddDocument(indexedDoc);
                    if (indexedDoc.Fields.Count > 1)
                    {
                        // back to 1 field
                        activeField = baseField;
                        indexedDoc.RemoveFields("dummy");
                        indexedDoc.Add(activeField);
                    }
                }
            }
            catch (Exception)
            {
                // TODO: really we should pass a random seed to
                // checkAnalysisConsistency then print it here too:
                Console.Error.WriteLine("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + Escape(text) + "'");
                throw;
            }
        }
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(lineDocs);
    }
}