internal virtual void AssertQuery(Query query, Filter filter, Sort sort)
{
    int maxDoc = searcher.IndexReader.MaxDoc;
    TopDocs all;
    int pageSize = TestUtil.NextInt32(Random, 1, maxDoc * 2);
    if (isVerbose)
    {
        Console.WriteLine("\nassertQuery " + (iter++) + ": query=" + query + " filter=" + filter + " sort=" + sort + " pageSize=" + pageSize);
    }
    bool doMaxScore = Random.NextBoolean();
    bool doScores = Random.NextBoolean();
    if (sort == null)
    {
        all = searcher.Search(query, filter, maxDoc);
    }
    else if (sort == Sort.RELEVANCE)
    {
        all = searcher.Search(query, filter, maxDoc, sort, true, doMaxScore);
    }
    else
    {
        all = searcher.Search(query, filter, maxDoc, sort, doScores, doMaxScore);
    }
    if (isVerbose)
    {
        Console.WriteLine(" all.TotalHits=" + all.TotalHits);
        int upto = 0;
        foreach (ScoreDoc scoreDoc in all.ScoreDocs)
        {
            Console.WriteLine(" hit " + (upto++) + ": id=" + searcher.Doc(scoreDoc.Doc).Get("id") + " " + scoreDoc);
        }
    }
    int pageStart = 0;
    ScoreDoc lastBottom = null;
    while (pageStart < all.TotalHits)
    {
        TopDocs paged;
        if (sort == null)
        {
            if (isVerbose)
            {
                Console.WriteLine(" iter lastBottom=" + lastBottom);
            }
            paged = searcher.SearchAfter(lastBottom, query, filter, pageSize);
        }
        else
        {
            if (isVerbose)
            {
                Console.WriteLine(" iter lastBottom=" + lastBottom);
            }
            if (sort == Sort.RELEVANCE)
            {
                paged = searcher.SearchAfter(lastBottom, query, filter, pageSize, sort, true, doMaxScore);
            }
            else
            {
                paged = searcher.SearchAfter(lastBottom, query, filter, pageSize, sort, doScores, doMaxScore);
            }
        }
        if (isVerbose)
        {
            Console.WriteLine(" " + paged.ScoreDocs.Length + " hits on page");
        }
        if (paged.ScoreDocs.Length == 0)
        {
            break;
        }
        AssertPage(pageStart, all, paged);
        pageStart += paged.ScoreDocs.Length;
        lastBottom = paged.ScoreDocs[paged.ScoreDocs.Length - 1];
    }
    Assert.AreEqual(all.ScoreDocs.Length, pageStart);
}
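// Illustrative sketch (not part of the test above): a minimal deep-paging loop with
// IndexSearcher.SearchAfter, assuming an already-opened IndexSearcher and a caller-chosen
// page size. It mirrors the paging pattern AssertQuery verifies: the bottom hit of each
// page is handed back as the "after" anchor for the next page.
private static void PageThroughAllHits(IndexSearcher searcher, Query query, int pageSize)
{
    ScoreDoc after = null; // bottom hit of the previous page
    while (true)
    {
        TopDocs page = after == null
            ? searcher.Search(query, pageSize)
            : searcher.SearchAfter(after, query, pageSize); // skips everything up to and including "after"
        if (page.ScoreDocs.Length == 0)
        {
            break; // no more hits
        }
        foreach (ScoreDoc hit in page.ScoreDocs)
        {
            Console.WriteLine("doc=" + hit.Doc + " score=" + hit.Score);
        }
        after = page.ScoreDocs[page.ScoreDocs.Length - 1];
    }
}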
public override void SetUp() { base.SetUp(); // LUCENENET specific: Moved this logic here to ensure that it is executed // after the class is setup - a field is way to early to execute this. bool supportsDocValues = Codec.Default.Name.Equals("Lucene3x", StringComparison.Ordinal) == false; allSortFields = new List <SortField> { #pragma warning disable 612,618 new SortField("byte", SortFieldType.BYTE, false), new SortField("short", SortFieldType.INT16, false), #pragma warning restore 612,618 new SortField("int", SortFieldType.INT32, false), new SortField("long", SortFieldType.INT64, false), new SortField("float", SortFieldType.SINGLE, false), new SortField("double", SortFieldType.DOUBLE, false), new SortField("bytes", SortFieldType.STRING, false), new SortField("bytesval", SortFieldType.STRING_VAL, false), #pragma warning disable 612,618 new SortField("byte", SortFieldType.BYTE, true), new SortField("short", SortFieldType.INT16, true), #pragma warning restore 612,618 new SortField("int", SortFieldType.INT32, true), new SortField("long", SortFieldType.INT64, true), new SortField("float", SortFieldType.SINGLE, true), new SortField("double", SortFieldType.DOUBLE, true), new SortField("bytes", SortFieldType.STRING, true), new SortField("bytesval", SortFieldType.STRING_VAL, true), SortField.FIELD_SCORE, SortField.FIELD_DOC }; if (supportsDocValues) { allSortFields.AddRange(new SortField[] { new SortField("intdocvalues", SortFieldType.INT32, false), new SortField("floatdocvalues", SortFieldType.SINGLE, false), new SortField("sortedbytesdocvalues", SortFieldType.STRING, false), new SortField("sortedbytesdocvaluesval", SortFieldType.STRING_VAL, false), new SortField("straightbytesdocvalues", SortFieldType.STRING_VAL, false), new SortField("intdocvalues", SortFieldType.INT32, true), new SortField("floatdocvalues", SortFieldType.SINGLE, true), new SortField("sortedbytesdocvalues", SortFieldType.STRING, true), new SortField("sortedbytesdocvaluesval", SortFieldType.STRING_VAL, true), new SortField("straightbytesdocvalues", SortFieldType.STRING_VAL, true) }); } // Also test missing first / last for the "string" sorts: foreach (string field in new string[] { "bytes", "sortedbytesdocvalues" }) { for (int rev = 0; rev < 2; rev++) { bool reversed = rev == 0; SortField sf = new SortField(field, SortFieldType.STRING, reversed); sf.MissingValue = SortField.STRING_FIRST; allSortFields.Add(sf); sf = new SortField(field, SortFieldType.STRING, reversed); sf.MissingValue = SortField.STRING_LAST; allSortFields.Add(sf); } } int limit = allSortFields.Count; for (int i = 0; i < limit; i++) { SortField sf = allSortFields[i]; if (sf.Type == SortFieldType.INT32) { SortField sf2 = new SortField(sf.Field, SortFieldType.INT32, sf.IsReverse); sf2.MissingValue = Random.Next(); allSortFields.Add(sf2); } else if (sf.Type == SortFieldType.INT64) { SortField sf2 = new SortField(sf.Field, SortFieldType.INT64, sf.IsReverse); sf2.MissingValue = Random.NextInt64(); allSortFields.Add(sf2); } else if (sf.Type == SortFieldType.SINGLE) { SortField sf2 = new SortField(sf.Field, SortFieldType.SINGLE, sf.IsReverse); sf2.MissingValue = (float)Random.NextDouble(); allSortFields.Add(sf2); } else if (sf.Type == SortFieldType.DOUBLE) { SortField sf2 = new SortField(sf.Field, SortFieldType.DOUBLE, sf.IsReverse); sf2.MissingValue = Random.NextDouble(); allSortFields.Add(sf2); } } dir = NewDirectory(); RandomIndexWriter iw = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, dir); int numDocs = AtLeast(200); for 
(int i = 0; i < numDocs; i++) { IList <Field> fields = new List <Field>(); fields.Add(NewTextField("english", English.Int32ToEnglish(i), Field.Store.NO)); fields.Add(NewTextField("oddeven", (i % 2 == 0) ? "even" : "odd", Field.Store.NO)); fields.Add(NewStringField("byte", "" + ((sbyte)Random.Next()).ToString(CultureInfo.InvariantCulture), Field.Store.NO)); fields.Add(NewStringField("short", "" + ((short)Random.Next()).ToString(CultureInfo.InvariantCulture), Field.Store.NO)); fields.Add(new Int32Field("int", Random.Next(), Field.Store.NO)); fields.Add(new Int64Field("long", Random.NextInt64(), Field.Store.NO)); fields.Add(new SingleField("float", (float)Random.NextDouble(), Field.Store.NO)); fields.Add(new DoubleField("double", Random.NextDouble(), Field.Store.NO)); fields.Add(NewStringField("bytes", TestUtil.RandomRealisticUnicodeString(Random), Field.Store.NO)); fields.Add(NewStringField("bytesval", TestUtil.RandomRealisticUnicodeString(Random), Field.Store.NO)); fields.Add(new DoubleField("double", Random.NextDouble(), Field.Store.NO)); if (supportsDocValues) { fields.Add(new NumericDocValuesField("intdocvalues", Random.Next())); fields.Add(new SingleDocValuesField("floatdocvalues", (float)Random.NextDouble())); fields.Add(new SortedDocValuesField("sortedbytesdocvalues", new BytesRef(TestUtil.RandomRealisticUnicodeString(Random)))); fields.Add(new SortedDocValuesField("sortedbytesdocvaluesval", new BytesRef(TestUtil.RandomRealisticUnicodeString(Random)))); fields.Add(new BinaryDocValuesField("straightbytesdocvalues", new BytesRef(TestUtil.RandomRealisticUnicodeString(Random)))); } Document document = new Document(); document.Add(new StoredField("id", "" + i)); if (isVerbose) { Console.WriteLine(" add doc id=" + i); } foreach (Field field in fields) { // So we are sometimes missing that field: if (Random.Next(5) != 4) { document.Add(field); if (isVerbose) { Console.WriteLine(" " + field); } } } iw.AddDocument(document); if (Random.Next(50) == 17) { iw.Commit(); } } reader = iw.GetReader(); iw.Dispose(); searcher = NewSearcher(reader); if (isVerbose) { Console.WriteLine(" searcher=" + searcher); } }
public virtual void TestRollingUpdates_Mem() { Random random = new J2N.Randomizer(Random.NextInt64()); BaseDirectoryWrapper dir = NewDirectory(); LineFileDocs docs = new LineFileDocs(random, DefaultCodecSupportsDocValues); //provider.register(new MemoryCodec()); if ((!"Lucene3x".Equals(Codec.Default.Name, StringComparison.Ordinal)) && LuceneTestCase.Random.NextBoolean()) { Codec.Default = TestUtil.AlwaysPostingsFormat(new MemoryPostingsFormat(LuceneTestCase.Random.nextBoolean(), random.NextSingle())); } MockAnalyzer analyzer = new MockAnalyzer(LuceneTestCase.Random); analyzer.MaxTokenLength = TestUtil.NextInt32(LuceneTestCase.Random, 1, IndexWriter.MAX_TERM_LENGTH); IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); int SIZE = AtLeast(20); int id = 0; IndexReader r = null; IndexSearcher s = null; int numUpdates = (int)(SIZE * (2 + (TestNightly ? 200 * LuceneTestCase.Random.NextDouble() : 5 * LuceneTestCase.Random.NextDouble()))); if (Verbose) { Console.WriteLine("TEST: numUpdates=" + numUpdates); } int updateCount = 0; // TODO: sometimes update ids not in order... for (int docIter = 0; docIter < numUpdates; docIter++) { Documents.Document doc = docs.NextDoc(); string myID = "" + id; if (id == SIZE - 1) { id = 0; } else { id++; } if (Verbose) { Console.WriteLine(" docIter=" + docIter + " id=" + id); } ((Field)doc.GetField("docid")).SetStringValue(myID); Term idTerm = new Term("docid", myID); bool doUpdate; if (s != null && updateCount < SIZE) { TopDocs hits = s.Search(new TermQuery(idTerm), 1); Assert.AreEqual(1, hits.TotalHits); doUpdate = !w.TryDeleteDocument(r, hits.ScoreDocs[0].Doc); if (Verbose) { if (doUpdate) { Console.WriteLine(" tryDeleteDocument failed"); } else { Console.WriteLine(" tryDeleteDocument succeeded"); } } } else { doUpdate = true; if (Verbose) { Console.WriteLine(" no searcher: doUpdate=true"); } } updateCount++; if (doUpdate) { w.UpdateDocument(idTerm, doc); } else { w.AddDocument(doc); } if (docIter >= SIZE && LuceneTestCase.Random.Next(50) == 17) { if (r != null) { r.Dispose(); } bool applyDeletions = LuceneTestCase.Random.NextBoolean(); if (Verbose) { Console.WriteLine("TEST: reopen applyDeletions=" + applyDeletions); } r = w.GetReader(applyDeletions); if (applyDeletions) { s = NewSearcher(r); } else { s = null; } Assert.IsTrue(!applyDeletions || r.NumDocs == SIZE, "applyDeletions=" + applyDeletions + " r.NumDocs=" + r.NumDocs + " vs SIZE=" + SIZE); updateCount = 0; } } if (r != null) { r.Dispose(); } w.Commit(); Assert.AreEqual(SIZE, w.NumDocs); w.Dispose(); TestIndexWriter.AssertNoUnreferencedFiles(dir, "leftover files after rolling updates"); docs.Dispose(); // LUCENE-4455: SegmentInfos infos = new SegmentInfos(); infos.Read(dir); long totalBytes = 0; foreach (SegmentCommitInfo sipc in infos.Segments) { totalBytes += sipc.GetSizeInBytes(); } long totalBytes2 = 0; foreach (string fileName in dir.ListAll()) { if (!fileName.StartsWith(IndexFileNames.SEGMENTS, StringComparison.Ordinal)) { totalBytes2 += dir.FileLength(fileName); } } Assert.AreEqual(totalBytes2, totalBytes); dir.Dispose(); }
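// Illustrative sketch: the update-by-id pattern the rolling-updates test exercises,
// assuming a hypothetical unique "docid" field. UpdateDocument atomically deletes any
// existing document matching the term and adds the new version; a near-real-time reader
// opened from the writer sees the change without a full commit.
private static void UpdateById(IndexWriter writer, string id, Document newVersion)
{
    newVersion.RemoveFields("docid");
    newVersion.Add(new StringField("docid", id, Field.Store.YES));
    writer.UpdateDocument(new Term("docid", id), newVersion);
}
// Usage sketch: DirectoryReader nrtReader = DirectoryReader.Open(writer, true); // true = apply deletes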
public virtual void TestFlushExceptions()
{
    MockDirectoryWrapper directory = NewMockDirectory();
    FailOnlyOnFlush failure = new FailOnlyOnFlush(this);
    directory.FailOn(failure);
    IndexWriter writer = new IndexWriter(directory, (IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMaxBufferedDocs(2));
    Document doc = new Document();
    Field idField = NewStringField("id", "", Field.Store.YES);
    doc.Add(idField);
    int extraCount = 0;
    for (int i = 0; i < 10; i++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: iter=" + i);
        }
        for (int j = 0; j < 20; j++)
        {
            idField.SetStringValue(Convert.ToString(i * 20 + j));
            writer.AddDocument(doc);
        }
        // must cycle here because sometimes the merge flushes
        // the doc we just added and so there's nothing to
        // flush, and we don't hit the exception
        while (true)
        {
            writer.AddDocument(doc);
            failure.SetDoFail();
            try
            {
                writer.Flush(true, true);
                if (failure.hitExc)
                {
                    Assert.Fail("failed to hit IOException");
                }
                extraCount++;
            }
            catch (IOException ioe)
            {
                if (Verbose)
                {
                    Console.WriteLine(ioe.StackTrace);
                }
                failure.ClearDoFail();
                break;
            }
        }
        Assert.AreEqual(20 * (i + 1) + extraCount, writer.NumDocs);
    }
    writer.Dispose();
    IndexReader reader = DirectoryReader.Open(directory);
    Assert.AreEqual(200 + extraCount, reader.NumDocs);
    reader.Dispose();
    directory.Dispose();
}
private IndexIterationContext CreateContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, bool multipleValuesPerDocument, bool scoreDocsInOrder) { IndexIterationContext context = new IndexIterationContext(); int numRandomValues = nDocs / 2; context.RandomUniqueValues = new string[numRandomValues]; ISet <string> trackSet = new JCG.HashSet <string>(); context.RandomFrom = new bool[numRandomValues]; for (int i = 0; i < numRandomValues; i++) { string uniqueRandomValue; do { uniqueRandomValue = TestUtil.RandomRealisticUnicodeString(Random); // uniqueRandomValue = TestUtil.randomSimpleString(random); } while ("".Equals(uniqueRandomValue, StringComparison.Ordinal) || trackSet.Contains(uniqueRandomValue)); // Generate unique values and empty strings aren't allowed. trackSet.Add(uniqueRandomValue); context.RandomFrom[i] = Random.NextBoolean(); context.RandomUniqueValues[i] = uniqueRandomValue; } RandomDoc[] docs = new RandomDoc[nDocs]; for (int i = 0; i < nDocs; i++) { string id = Convert.ToString(i); int randomI = Random.Next(context.RandomUniqueValues.Length); string value = context.RandomUniqueValues[randomI]; Document document = new Document(); document.Add(NewTextField(Random, "id", id, Field.Store.NO)); document.Add(NewTextField(Random, "value", value, Field.Store.NO)); bool from = context.RandomFrom[randomI]; int numberOfLinkValues = multipleValuesPerDocument ? 2 + Random.Next(10) : 1; docs[i] = new RandomDoc(id, numberOfLinkValues, value, from); for (int j = 0; j < numberOfLinkValues; j++) { string linkValue = context.RandomUniqueValues[Random.Next(context.RandomUniqueValues.Length)]; docs[i].LinkValues.Add(linkValue); if (from) { if (!context.FromDocuments.TryGetValue(linkValue, out IList <RandomDoc> fromDocs)) { context.FromDocuments[linkValue] = fromDocs = new List <RandomDoc>(); } if (!context.RandomValueFromDocs.TryGetValue(value, out IList <RandomDoc> randomValueFromDocs)) { context.RandomValueFromDocs[value] = randomValueFromDocs = new List <RandomDoc>(); } fromDocs.Add(docs[i]); randomValueFromDocs.Add(docs[i]); document.Add(NewTextField(Random, "from", linkValue, Field.Store.NO)); } else { if (!context.ToDocuments.TryGetValue(linkValue, out IList <RandomDoc> toDocuments)) { context.ToDocuments[linkValue] = toDocuments = new List <RandomDoc>(); } if (!context.RandomValueToDocs.TryGetValue(value, out IList <RandomDoc> randomValueToDocs)) { context.RandomValueToDocs[value] = randomValueToDocs = new List <RandomDoc>(); } toDocuments.Add(docs[i]); randomValueToDocs.Add(docs[i]); document.Add(NewTextField(Random, "to", linkValue, Field.Store.NO)); } } RandomIndexWriter w; if (from) { w = fromWriter; } else { w = toWriter; } w.AddDocument(document); if (Random.Next(10) == 4) { w.Commit(); } if (VERBOSE) { Console.WriteLine("Added document[" + docs[i].Id + "]: " + document); } } // Pre-compute all possible hits for all unique random values. On top of this also compute all possible score for // any ScoreMode. 
IndexSearcher fromSearcher = NewSearcher(fromWriter.GetReader()); IndexSearcher toSearcher = NewSearcher(toWriter.GetReader()); for (int i = 0; i < context.RandomUniqueValues.Length; i++) { string uniqueRandomValue = context.RandomUniqueValues[i]; string fromField; string toField; IDictionary <string, IDictionary <int, JoinScore> > queryVals; if (context.RandomFrom[i]) { fromField = "from"; toField = "to"; queryVals = context.FromHitsToJoinScore; } else { fromField = "to"; toField = "from"; queryVals = context.ToHitsToJoinScore; } IDictionary <BytesRef, JoinScore> joinValueToJoinScores = new Dictionary <BytesRef, JoinScore>(); if (multipleValuesPerDocument) { fromSearcher.Search(new TermQuery(new Term("value", uniqueRandomValue)), new CollectorAnonymousInnerClassHelper3(this, context, fromField, joinValueToJoinScores)); } else { fromSearcher.Search(new TermQuery(new Term("value", uniqueRandomValue)), new CollectorAnonymousInnerClassHelper4(this, context, fromField, joinValueToJoinScores)); } IDictionary <int, JoinScore> docToJoinScore = new Dictionary <int, JoinScore>(); if (multipleValuesPerDocument) { if (scoreDocsInOrder) { AtomicReader slowCompositeReader = SlowCompositeReaderWrapper.Wrap(toSearcher.IndexReader); Terms terms = slowCompositeReader.GetTerms(toField); if (terms != null) { DocsEnum docsEnum = null; TermsEnum termsEnum = null; JCG.SortedSet <BytesRef> joinValues = new JCG.SortedSet <BytesRef>(BytesRef.UTF8SortedAsUnicodeComparer); joinValues.UnionWith(joinValueToJoinScores.Keys); foreach (BytesRef joinValue in joinValues) { termsEnum = terms.GetIterator(termsEnum); if (termsEnum.SeekExact(joinValue)) { docsEnum = termsEnum.Docs(slowCompositeReader.LiveDocs, docsEnum, DocsFlags.NONE); JoinScore joinScore = joinValueToJoinScores[joinValue]; for (int doc = docsEnum.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.NextDoc()) { // First encountered join value determines the score. // Something to keep in mind for many-to-many relations. if (!docToJoinScore.ContainsKey(doc)) { docToJoinScore[doc] = joinScore; } } } } } } else { toSearcher.Search(new MatchAllDocsQuery(), new CollectorAnonymousInnerClassHelper5(this, context, toField, joinValueToJoinScores, docToJoinScore)); } } else { toSearcher.Search(new MatchAllDocsQuery(), new CollectorAnonymousInnerClassHelper6(this, toField, joinValueToJoinScores, docToJoinScore)); } queryVals[uniqueRandomValue] = docToJoinScore; } fromSearcher.IndexReader.Dispose(); toSearcher.IndexReader.Dispose(); return(context); }
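// Illustrative sketch: the production-side counterpart of the join scores pre-computed
// above. JoinUtil.CreateJoinQuery turns a query on the "from" side into a query matching
// "to"-side documents that share a join term; ScoreMode controls how from-side scores are
// aggregated. Field names mirror the test; fromSearcher/toSearcher and the literal value
// are assumed to exist.
Query fromQuery = new TermQuery(new Term("value", "someUniqueValue"));
Query joinQuery = JoinUtil.CreateJoinQuery(
    "from",                    // fromField: join terms indexed on the from side
    multipleValuesPerDocument, // true when a from document carries several join values
    "to",                      // toField: join terms indexed on the to side
    fromQuery,
    fromSearcher,
    ScoreMode.Max);            // or None / Total / Avg
TopDocs joined = toSearcher.Search(joinQuery, 10);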
/// <summary> /// Entry point to the Compile application. /// <para/> /// This program takes any number of arguments: the first is the name of the /// desired stemming algorithm to use (a list is available in the package /// description) , all of the rest should be the path or paths to a file or /// files containing a stemmer table to compile. /// </summary> /// <param name="args">the command line arguments</param> public static void Main(string[] args) { if (args.Length < 1) { return; } // LUCENENET NOTE: This line does nothing in .NET // and also does nothing in Java...what? //args[0].ToUpperInvariant(); // Reads the first char of the first arg backward = args[0][0] == '-'; int qq = (backward) ? 1 : 0; bool storeorig = false; if (args[0][qq] == '0') { storeorig = true; qq++; } multi = args[0][qq] == 'M'; if (multi) { qq++; } // LUCENENET specific - reformatted with : string charset = SystemProperties.GetProperty("egothor:stemmer:charset", "UTF-8"); var stemmerTables = new List <string>(); // LUCENENET specific // command line argument overrides environment variable or default, if supplied for (int i = 1; i < args.Length; i++) { if ("-e".Equals(args[i], StringComparison.Ordinal) || "--encoding".Equals(args[i], StringComparison.Ordinal)) { charset = args[i]; } else { stemmerTables.Add(args[i]); } } char[] optimizer = new char[args[0].Length - qq]; for (int i = 0; i < optimizer.Length; i++) { optimizer[i] = args[0][qq + i]; } foreach (var stemmerTable in stemmerTables) { // System.out.println("[" + args[i] + "]"); Diff diff = new Diff(); //int stems = 0; // not used int words = 0; AllocTrie(); Console.WriteLine(stemmerTable); using (TextReader input = new StreamReader( new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset))) { string line; while ((line = input.ReadLine()) != null) { try { line = line.ToLowerInvariant(); StringTokenizer st = new StringTokenizer(line); st.MoveNext(); string stem = st.Current; if (storeorig) { trie.Add(stem, "-a"); words++; } while (st.MoveNext()) { string token = st.Current; if (token.Equals(stem, StringComparison.Ordinal) == false) { trie.Add(token, diff.Exec(token, stem)); words++; } } } catch (InvalidOperationException /*x*/) { // no base token (stem) on a line } } } Optimizer o = new Optimizer(); Optimizer2 o2 = new Optimizer2(); Lift l = new Lift(true); Lift e = new Lift(false); Gener g = new Gener(); for (int j = 0; j < optimizer.Length; j++) { string prefix; switch (optimizer[j]) { case 'G': trie = trie.Reduce(g); prefix = "G: "; break; case 'L': trie = trie.Reduce(l); prefix = "L: "; break; case 'E': trie = trie.Reduce(e); prefix = "E: "; break; case '2': trie = trie.Reduce(o2); prefix = "2: "; break; case '1': trie = trie.Reduce(o); prefix = "1: "; break; default: continue; } trie.PrintInfo(Console.Out, prefix + " "); } using DataOutputStream os = new DataOutputStream( new FileStream(stemmerTable + ".out", FileMode.OpenOrCreate, FileAccess.Write)); os.WriteUTF(args[0]); trie.Store(os); } }
public void TestRandom() { string[] terms = new string[TestUtil.NextInt32(Random, 2, 10)]; ISet <string> seen = new JCG.HashSet <string>(); while (seen.size() < terms.Length) { string token = TestUtil.RandomSimpleString(Random, 1, 5); if (!seen.contains(token)) { terms[seen.size()] = token; seen.add(token); } } Analyzer a = new MockAnalyzer(Random); int numDocs = AtLeast(10); long totTokens = 0; string[][] docs = new string[numDocs][]; for (int i = 0; i < numDocs; i++) { docs[i] = new string[AtLeast(100)]; if (Verbose) { Console.Write(" doc " + i + ":"); } for (int j = 0; j < docs[i].Length; j++) { docs[i][j] = GetZipfToken(terms); if (Verbose) { Console.Write(" " + docs[i][j]); } } if (Verbose) { Console.WriteLine(); } totTokens += docs[i].Length; } int grams = TestUtil.NextInt32(Random, 1, 4); if (Verbose) { Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams"); } // Build suggester model: FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20); sug.Build(new TestRandomInputEnumerator(docs)); // Build inefficient but hopefully correct model: List <IDictionary <string, int?> > gramCounts = new List <IDictionary <string, int?> >(grams); for (int gram = 0; gram < grams; gram++) { if (Verbose) { Console.WriteLine("TEST: build model for gram=" + gram); } IDictionary <string, int?> model = new JCG.Dictionary <string, int?>(); gramCounts.Add(model); foreach (string[] doc in docs) { for (int i = 0; i < doc.Length - gram; i++) { StringBuilder b = new StringBuilder(); for (int j = i; j <= i + gram; j++) { if (j > i) { b.append(' '); } b.append(doc[j]); } string token = b.toString(); if (!model.TryGetValue(token, out int?curCount) || curCount == null) { model.Put(token, 1); } else { model.Put(token, 1 + curCount); } if (Verbose) { Console.WriteLine(" add '" + token + "' -> count=" + (model.TryGetValue(token, out int?count) ? (count.HasValue ? 
count.ToString() : "null") : "")); } } } } int lookups = AtLeast(100); for (int iter = 0; iter < lookups; iter++) { string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)]; for (int i = 0; i < tokens.Length; i++) { tokens[i] = GetZipfToken(terms); } // Maybe trim last token; be sure not to create the // empty string: int trimStart; if (tokens.Length == 1) { trimStart = 1; } else { trimStart = 0; } int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[tokens.Length - 1].Length); tokens[tokens.Length - 1] = tokens[tokens.Length - 1].Substring(0, trimAt - 0); int num = TestUtil.NextInt32(Random, 1, 100); StringBuilder b = new StringBuilder(); foreach (string token in tokens) { b.append(' '); b.append(token); } string query = b.toString(); query = query.Substring(1); if (Verbose) { Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num); } // Expected: List <Lookup.LookupResult> expected = new List <Lookup.LookupResult>(); double backoff = 1.0; seen = new JCG.HashSet <string>(); if (Verbose) { Console.WriteLine(" compute expected"); } for (int i = grams - 1; i >= 0; i--) { if (Verbose) { Console.WriteLine(" grams=" + i); } if (tokens.Length < i + 1) { // Don't have enough tokens to use this model if (Verbose) { Console.WriteLine(" skip"); } continue; } if (i == 0 && tokens[tokens.Length - 1].Length == 0) { // Never suggest unigrams from empty string: if (Verbose) { Console.WriteLine(" skip unigram priors only"); } continue; } // Build up "context" ngram: b = new StringBuilder(); for (int j = tokens.Length - i - 1; j < tokens.Length - 1; j++) { b.append(' '); b.append(tokens[j]); } string context = b.toString(); if (context.Length > 0) { context = context.Substring(1); } if (Verbose) { Console.WriteLine(" context='" + context + "'"); } long contextCount; if (context.Length == 0) { contextCount = totTokens; } else { //int? count = gramCounts.get(i - 1).get(context); var gramCount = gramCounts[i - 1]; if (!gramCount.TryGetValue(context, out int?count) || count == null) { // We never saw this context: backoff *= FreeTextSuggester.ALPHA; if (Verbose) { Console.WriteLine(" skip: never saw context"); } continue; } contextCount = count.GetValueOrDefault(); } if (Verbose) { Console.WriteLine(" contextCount=" + contextCount); } IDictionary <string, int?> model = gramCounts[i]; // First pass, gather all predictions for this model: if (Verbose) { Console.WriteLine(" find terms w/ prefix=" + tokens[tokens.Length - 1]); } List <Lookup.LookupResult> tmp = new List <Lookup.LookupResult>(); foreach (string term in terms) { if (term.StartsWith(tokens[tokens.Length - 1], StringComparison.Ordinal)) { if (Verbose) { Console.WriteLine(" term=" + term); } if (seen.contains(term)) { if (Verbose) { Console.WriteLine(" skip seen"); } continue; } string ngram = (context + " " + term).Trim(); //Integer count = model.get(ngram); if (model.TryGetValue(ngram, out int?count) && count != null) { // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes // return numbers that are greater than long.MaxValue, which results in a negative long number. // This is also the way it is being done in the FreeTextSuggester to work around the issue. 
Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount))); tmp.Add(lr); if (Verbose) { Console.WriteLine(" add tmp key='" + lr.Key + "' score=" + lr.Value); } } } } // Second pass, trim to only top N, and fold those // into overall suggestions: tmp.Sort(byScoreThenKey); if (tmp.size() > num) { //tmp.subList(num, tmp.size()).clear(); tmp.RemoveRange(num, tmp.size() - num); // LUCENENET: Converted end index to length } foreach (Lookup.LookupResult result in tmp) { string key = result.Key.toString(); int idx = key.LastIndexOf(' '); string lastToken; if (idx != -1) { lastToken = key.Substring(idx + 1); } else { lastToken = key; } if (!seen.contains(lastToken)) { seen.add(lastToken); expected.Add(result); if (Verbose) { Console.WriteLine(" keep key='" + result.Key + "' score=" + result.Value); } } } backoff *= FreeTextSuggester.ALPHA; } expected.Sort(byScoreThenKey); if (expected.size() > num) { expected.RemoveRange(num, expected.size() - num); // LUCENENET: Converted end index to length } // Actual: IList <Lookup.LookupResult> actual = sug.DoLookup(query, num); if (Verbose) { Console.WriteLine(" expected: " + expected); Console.WriteLine(" actual: " + actual); } assertEquals(expected.ToString(), actual.ToString()); } }
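// Illustrative sketch: the "inefficient but hopefully correct" reference model built
// above, reduced to its essence. For a given gram order it counts every (gram + 1)-token
// window across all documents; the suggester's scores are later checked against these raw
// counts. Pure .NET, no Lucene types involved.
private static IDictionary<string, int> CountNgrams(string[][] docs, int gram)
{
    IDictionary<string, int> model = new Dictionary<string, int>();
    foreach (string[] doc in docs)
    {
        for (int i = 0; i < doc.Length - gram; i++)
        {
            // join tokens i..i+gram with single spaces, e.g. "foo bar" for gram=1
            string token = string.Join(" ", doc, i, gram + 1);
            model.TryGetValue(token, out int count); // count stays 0 when the key is new
            model[token] = count + 1;
        }
    }
    return model;
}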
public virtual TokenInfoDictionaryWriter BuildDictionary(IList <string> csvFiles) { TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024); // all lines in the file Console.WriteLine(" parse..."); List <string[]> lines = new List <string[]>(400000); foreach (string file in csvFiles) { using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read)) { Encoding decoder = Encoding.GetEncoding(encoding); TextReader reader = new StreamReader(inputStream, decoder); string line = null; while ((line = reader.ReadLine()) != null) { string[] entry = CSVUtil.Parse(line); if (entry.Length < 13) { Console.WriteLine("Entry in CSV is not valid: " + line); continue; } string[] formatted = FormatEntry(entry); lines.Add(formatted); // NFKC normalize dictionary entry if (normalizeEntries) { //if (normalizer.isNormalized(entry[0])){ if (entry[0].IsNormalized(NormalizationForm.FormKC)) { continue; } string[] normalizedEntry = new string[entry.Length]; for (int i = 0; i < entry.Length; i++) { //normalizedEntry[i] = normalizer.normalize(entry[i]); normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC); } formatted = FormatEntry(normalizedEntry); lines.Add(formatted); } } } } Console.WriteLine(" sort..."); // sort by term: we sorted the files already and use a stable sort. lines.Sort(new ComparerAnonymousHelper()); Console.WriteLine(" encode..."); PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton; Builder <long?> fstBuilder = new Builder <long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15); Int32sRef scratch = new Int32sRef(); long ord = -1; // first ord will be 0 string lastValue = null; // build tokeninfo dictionary foreach (string[] entry in lines) { int next = dictionary.Put(entry); if (next == offset) { Console.WriteLine("Failed to process line: " + Collections.ToString(entry)); continue; } string token = entry[0]; if (!token.Equals(lastValue, StringComparison.Ordinal)) { // new word to add to fst ord++; lastValue = token; scratch.Grow(token.Length); scratch.Length = token.Length; for (int i = 0; i < token.Length; i++) { scratch.Int32s[i] = (int)token[i]; } fstBuilder.Add(scratch, ord); } dictionary.AddMapping((int)ord, offset); offset = next; } FST <long?> fst = fstBuilder.Finish(); Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... "); dictionary.SetFST(fst); Console.WriteLine(" done"); return(dictionary); }
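// Illustrative sketch: the NFKC handling used above, in isolation. An entry whose surface
// form is already in Form KC is left alone; otherwise a second, fully normalized copy of
// the CSV entry is produced so both spellings resolve in the dictionary. The helper name
// is hypothetical; the normalization calls are standard .NET.
private static string[] MaybeNormalizeEntry(string[] entry)
{
    if (entry[0].IsNormalized(NormalizationForm.FormKC))
    {
        return null; // nothing extra to add
    }
    string[] normalized = new string[entry.Length];
    for (int i = 0; i < entry.Length; i++)
    {
        normalized[i] = entry[i].Normalize(NormalizationForm.FormKC);
    }
    return normalized;
}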
public override void Run() { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread"); } while (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond < stopTimeMS) // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results { try { IndexSearcher s = outerInstance.GetCurrentSearcher(); try { // Verify 1) IW is correctly setting // diagnostics, and 2) segment warming for // merged segments is actually happening: foreach (AtomicReaderContext sub in s.IndexReader.Leaves) { SegmentReader segReader = (SegmentReader)sub.Reader; IDictionary <string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics; assertNotNull(diagnostics); diagnostics.TryGetValue("source", out string source); assertNotNull(source); if (source.Equals("merge", StringComparison.Ordinal)) { assertTrue("sub reader " + sub + " wasn't warmed: warmed=" + outerInstance.warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo, // LUCENENET: ConditionalWeakTable doesn't have ContainsKey, so we normalize to TryGetValue !outerInstance.m_assertMergedSegmentsWarmed || outerInstance.warmed.TryGetValue(segReader.core, out BooleanRef _)); } } if (s.IndexReader.NumDocs > 0) { outerInstance.SmokeTestSearcher(s); Fields fields = MultiFields.GetFields(s.IndexReader); if (fields == null) { continue; } Terms terms = fields.GetTerms("body"); if (terms == null) { continue; } TermsEnum termsEnum = terms.GetEnumerator(); int seenTermCount = 0; int shift; int trigger; if (totTermCount < 30) { shift = 0; trigger = 1; } else { trigger = totTermCount / 30; shift = Random.Next(trigger); } while (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond < stopTimeMS) // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results { if (!termsEnum.MoveNext()) { totTermCount.Value = seenTermCount; break; } seenTermCount++; // search 30 terms if ((seenTermCount + shift) % trigger == 0) { //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + " now search body:" + term.Utf8ToString()); //} totHits.AddAndGet(outerInstance.RunQuery(s, new TermQuery(new Term("body", termsEnum.Term)))); } } //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + ": search done"); //} } } finally { outerInstance.ReleaseSearcher(s); } } catch (Exception t) when(t.IsThrowable()) { Console.WriteLine(Thread.CurrentThread.Name + ": hit exc"); outerInstance.m_failed.Value = (true); Console.WriteLine(t.ToString()); throw RuntimeException.Create(t); } } }
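// Illustrative sketch: reading per-segment diagnostics the same way the search thread
// above does, assuming an open reader whose leaves are SegmentReaders. The "source"
// diagnostic distinguishes segments produced by a flush from those produced by a merge,
// which is what the warming assertion keys on.
private static void PrintSegmentSources(IndexReader reader)
{
    foreach (AtomicReaderContext leaf in reader.Leaves)
    {
        SegmentReader segReader = (SegmentReader)leaf.Reader;
        IDictionary<string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics;
        diagnostics.TryGetValue("source", out string source); // typically "flush" or "merge"
        Console.WriteLine(segReader.SegmentInfo.Info.Name + ": source=" + source);
    }
}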
public void TestRandom() { int numberOfRuns = TestUtil.NextInt32(Random, 3, 6); for (int iter = 0; iter < numberOfRuns; iter++) { if (VERBOSE) { Console.WriteLine(string.Format("TEST: iter={0} total={1}", iter, numberOfRuns)); } int numDocs = TestUtil.NextInt32(Random, 100, 1000) * RANDOM_MULTIPLIER; int numGroups = TestUtil.NextInt32(Random, 1, numDocs); if (VERBOSE) { Console.WriteLine("TEST: numDocs=" + numDocs + " numGroups=" + numGroups); } List <BytesRef> groups = new List <BytesRef>(); for (int i = 0; i < numGroups; i++) { string randomValue; do { // B/c of DV based impl we can't see the difference between an empty string and a null value. // For that reason we don't generate empty string groups. randomValue = TestUtil.RandomRealisticUnicodeString(Random); } while ("".Equals(randomValue, StringComparison.Ordinal)); groups.Add(new BytesRef(randomValue)); } string[] contentStrings = new string[TestUtil.NextInt32(Random, 2, 20)]; if (VERBOSE) { Console.WriteLine("TEST: create fake content"); } for (int contentIDX = 0; contentIDX < contentStrings.Length; contentIDX++) { StringBuilder sb = new StringBuilder(); sb.append("real").append(Random.nextInt(3)).append(' '); int fakeCount = Random.nextInt(10); for (int fakeIDX = 0; fakeIDX < fakeCount; fakeIDX++) { sb.append("fake "); } contentStrings[contentIDX] = sb.toString(); if (VERBOSE) { Console.WriteLine(" content=" + sb.toString()); } } Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter( Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))); bool preFlex = "Lucene3x".Equals(w.IndexWriter.Config.Codec.Name, StringComparison.Ordinal); bool canUseIDV = !preFlex; DocValuesType valueType = vts[Random.nextInt(vts.Length)]; Document doc = new Document(); Document docNoGroup = new Document(); Field group = NewStringField("group", "", Field.Store.NO); doc.Add(group); Field valuesField = null; if (canUseIDV) { switch (valueType) { case DocValuesType.BINARY: valuesField = new BinaryDocValuesField("group_dv", new BytesRef()); break; case DocValuesType.SORTED: valuesField = new SortedDocValuesField("group_dv", new BytesRef()); break; default: fail("unhandled type"); break; } doc.Add(valuesField); } Field sort1 = NewStringField("sort1", "", Field.Store.NO); doc.Add(sort1); docNoGroup.Add(sort1); Field sort2 = NewStringField("sort2", "", Field.Store.NO); doc.Add(sort2); docNoGroup.Add(sort2); Field sort3 = NewStringField("sort3", "", Field.Store.NO); doc.Add(sort3); docNoGroup.Add(sort3); Field content = NewTextField("content", "", Field.Store.NO); doc.Add(content); docNoGroup.Add(content); Int32Field id = new Int32Field("id", 0, Field.Store.NO); doc.Add(id); docNoGroup.Add(id); GroupDoc[] groupDocs = new GroupDoc[numDocs]; for (int i = 0; i < numDocs; i++) { BytesRef groupValue; if (Random.nextInt(24) == 17) { // So we test the "doc doesn't have the group'd // field" case: groupValue = null; } else { groupValue = groups[Random.nextInt(groups.size())]; } GroupDoc groupDoc = new GroupDoc( i, groupValue, groups[Random.nextInt(groups.size())], groups[Random.nextInt(groups.size())], new BytesRef(string.Format(CultureInfo.InvariantCulture, "{0:D5}", i)), contentStrings[Random.nextInt(contentStrings.Length)] ); if (VERBOSE) { Console.WriteLine(" doc content=" + groupDoc.content + " id=" + i + " group=" + (groupDoc.group == null ? 
"null" : groupDoc.group.Utf8ToString()) + " sort1=" + groupDoc.sort1.Utf8ToString() + " sort2=" + groupDoc.sort2.Utf8ToString() + " sort3=" + groupDoc.sort3.Utf8ToString()); } groupDocs[i] = groupDoc; if (groupDoc.group != null) { group.SetStringValue(groupDoc.group.Utf8ToString()); if (canUseIDV) { valuesField.SetBytesValue(new BytesRef(groupDoc.group.Utf8ToString())); } } sort1.SetStringValue(groupDoc.sort1.Utf8ToString()); sort2.SetStringValue(groupDoc.sort2.Utf8ToString()); sort3.SetStringValue(groupDoc.sort3.Utf8ToString()); content.SetStringValue(groupDoc.content); id.SetInt32Value(groupDoc.id); if (groupDoc.group == null) { w.AddDocument(docNoGroup); } else { w.AddDocument(doc); } } DirectoryReader r = w.GetReader(); w.Dispose(); // NOTE: intentional but temporary field cache insanity! FieldCache.Int32s docIdToFieldId = FieldCache.DEFAULT.GetInt32s(SlowCompositeReaderWrapper.Wrap(r), "id", false); int[] fieldIdToDocID = new int[numDocs]; for (int i = 0; i < numDocs; i++) { int fieldId = docIdToFieldId.Get(i); fieldIdToDocID[fieldId] = i; } try { IndexSearcher s = NewSearcher(r); if (typeof(SlowCompositeReaderWrapper).GetTypeInfo().IsAssignableFrom(s.IndexReader.GetType())) { canUseIDV = false; } else { canUseIDV = !preFlex; } for (int contentID = 0; contentID < 3; contentID++) { ScoreDoc[] hits = s.Search(new TermQuery(new Term("content", "real" + contentID)), numDocs).ScoreDocs; foreach (ScoreDoc hit in hits) { GroupDoc gd = groupDocs[docIdToFieldId.Get(hit.Doc)]; assertTrue(gd.score == 0.0); gd.score = hit.Score; int docId = gd.id; assertEquals(docId, docIdToFieldId.Get(hit.Doc)); } } foreach (GroupDoc gd in groupDocs) { assertTrue(gd.score != 0.0); } for (int searchIter = 0; searchIter < 100; searchIter++) { if (VERBOSE) { Console.WriteLine("TEST: searchIter=" + searchIter); } string searchTerm = "real" + Random.nextInt(3); bool sortByScoreOnly = Random.nextBoolean(); Sort sortWithinGroup = GetRandomSort(sortByScoreOnly); AbstractAllGroupHeadsCollector allGroupHeadsCollector = CreateRandomCollector("group", sortWithinGroup, canUseIDV, valueType); s.Search(new TermQuery(new Term("content", searchTerm)), allGroupHeadsCollector); int[] expectedGroupHeads = CreateExpectedGroupHeads(searchTerm, groupDocs, sortWithinGroup, sortByScoreOnly, fieldIdToDocID); int[] actualGroupHeads = allGroupHeadsCollector.RetrieveGroupHeads(); // The actual group heads contains Lucene ids. Need to change them into our id value. for (int i = 0; i < actualGroupHeads.Length; i++) { actualGroupHeads[i] = docIdToFieldId.Get(actualGroupHeads[i]); } // Allows us the easily iterate and assert the actual and expected results. Array.Sort(expectedGroupHeads); Array.Sort(actualGroupHeads); if (VERBOSE) { Console.WriteLine("Collector: " + allGroupHeadsCollector.GetType().Name); Console.WriteLine("Sort within group: " + sortWithinGroup); Console.WriteLine("Num group: " + numGroups); Console.WriteLine("Num doc: " + numDocs); Console.WriteLine("\n=== Expected: \n"); foreach (int expectedDocId in expectedGroupHeads) { GroupDoc expectedGroupDoc = groupDocs[expectedDocId]; string expectedGroup = expectedGroupDoc.group == null ? 
null : expectedGroupDoc.group.Utf8ToString(); Console.WriteLine( string.Format(CultureInfo.InvariantCulture, "Group:{0,10} score{1:0.0#######,5} Sort1:{2,10} Sort2:{3,10} Sort3:{4,10} doc:{5,10}", expectedGroup, expectedGroupDoc.score, expectedGroupDoc.sort1.Utf8ToString(), expectedGroupDoc.sort2.Utf8ToString(), expectedGroupDoc.sort3.Utf8ToString(), expectedDocId) ); } Console.WriteLine("\n=== Actual: \n"); foreach (int actualDocId in actualGroupHeads) { GroupDoc actualGroupDoc = groupDocs[actualDocId]; string actualGroup = actualGroupDoc.group == null ? null : actualGroupDoc.group.Utf8ToString(); Console.WriteLine( string.Format(CultureInfo.InvariantCulture, "Group:{0,10} score{1:0.0#######,5} Sort1:{2,10} Sort2:{3,10} Sort3:{4,10} doc:{5,10}", actualGroup, actualGroupDoc.score, actualGroupDoc.sort1.Utf8ToString(), actualGroupDoc.sort2.Utf8ToString(), actualGroupDoc.sort3.Utf8ToString(), actualDocId) ); } Console.WriteLine("\n==================================================================================="); } assertArrayEquals(expectedGroupHeads, actualGroupHeads); } } finally { QueryUtils.PurgeFieldCache(r); } r.Dispose(); dir.Dispose(); } }
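// Illustrative sketch: a non-randomized use of the group-heads collector this test
// verifies, assuming the term-based factory from the grouping module and an open searcher
// "s". The collector keeps the best document per group under the within-group sort, and
// RetrieveGroupHeads returns those documents' Lucene doc ids. Field names mirror the test.
Sort withinGroup = new Sort(new SortField("sort1", SortFieldType.STRING));
var groupHeads = TermAllGroupHeadsCollector.Create("group", withinGroup);
s.Search(new TermQuery(new Term("content", "real0")), groupHeads);
int[] groupHeadDocIds = groupHeads.RetrieveGroupHeads();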
public override void Run() { // TODO: would be better if this were cross thread, so that we make sure one thread deleting anothers added docs works: IList <string> toDeleteIDs = new JCG.List <string>(); IList <SubDocs> toDeleteSubDocs = new JCG.List <SubDocs>(); while (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond < stopTime && !outerInstance.m_failed) // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results { try { // Occasional longish pause if running // nightly if (LuceneTestCase.TestNightly && Random.Next(6) == 3) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": now long sleep"); } //Thread.Sleep(TestUtil.NextInt32(Random, 50, 500)); // LUCENENET specific - Reduced amount of pause to keep the total // Nightly test time under 1 hour Thread.Sleep(TestUtil.NextInt32(Random, 50, 250)); } // Rate limit ingest rate: if (Random.Next(7) == 5) { Thread.Sleep(TestUtil.NextInt32(Random, 1, 10)); if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": done sleep"); } } Document doc = docs.NextDoc(); if (doc == null) { break; } // Maybe add randomly named field string addedField; if (Random.NextBoolean()) { addedField = "extra" + Random.Next(40); doc.Add(NewTextField(addedField, "a random field", Field.Store.YES)); } else { addedField = null; } if (Random.NextBoolean()) { if (Random.NextBoolean()) { // Add/update doc block: string packID; SubDocs delSubDocs; if (toDeleteSubDocs.Count > 0 && Random.NextBoolean()) { delSubDocs = toDeleteSubDocs[Random.Next(toDeleteSubDocs.Count)]; if (Debugging.AssertsEnabled) { Debugging.Assert(!delSubDocs.Deleted); } toDeleteSubDocs.Remove(delSubDocs); // Update doc block, replacing prior packID packID = delSubDocs.PackID; } else { delSubDocs = null; // Add doc block, using new packID packID = outerInstance.m_packCount.GetAndIncrement().ToString(CultureInfo.InvariantCulture); } Field packIDField = NewStringField("packID", packID, Field.Store.YES); IList <string> docIDs = new JCG.List <string>(); SubDocs subDocs = new SubDocs(packID, docIDs); IList <Document> docsList = new JCG.List <Document>(); allSubDocs.Enqueue(subDocs); doc.Add(packIDField); docsList.Add(TestUtil.CloneDocument(doc)); docIDs.Add(doc.Get("docid")); int maxDocCount = TestUtil.NextInt32(Random, 1, 10); while (docsList.Count < maxDocCount) { doc = docs.NextDoc(); if (doc == null) { break; } docsList.Add(TestUtil.CloneDocument(doc)); docIDs.Add(doc.Get("docid")); } outerInstance.m_addCount.AddAndGet(docsList.Count); Term packIDTerm = new Term("packID", packID); if (delSubDocs != null) { delSubDocs.Deleted = true; delIDs.UnionWith(delSubDocs.SubIDs); outerInstance.m_delCount.AddAndGet(delSubDocs.SubIDs.Count); if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": update pack packID=" + delSubDocs.PackID + " count=" + docsList.Count + " docs=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", docIDs)); } outerInstance.UpdateDocuments(packIDTerm, docsList); } else { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": add pack packID=" + packID + " count=" + docsList.Count + " docs=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", docIDs)); } outerInstance.AddDocuments(packIDTerm, docsList); } doc.RemoveField("packID"); if (Random.Next(5) == 2) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": buffer del id:" + packID); } toDeleteSubDocs.Add(subDocs); } } else { // Add single doc string docid = doc.Get("docid"); if (Verbose) { 
Console.WriteLine(Thread.CurrentThread.Name + ": add doc docid:" + docid); } outerInstance.AddDocument(new Term("docid", docid), doc); outerInstance.m_addCount.GetAndIncrement(); if (Random.Next(5) == 3) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": buffer del id:" + doc.Get("docid")); } toDeleteIDs.Add(docid); } } } else { // Update single doc, but we never re-use // and ID so the delete will never // actually happen: if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": update doc id:" + doc.Get("docid")); } string docid = doc.Get("docid"); outerInstance.UpdateDocument(new Term("docid", docid), doc); outerInstance.m_addCount.GetAndIncrement(); if (Random.Next(5) == 3) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": buffer del id:" + doc.Get("docid")); } toDeleteIDs.Add(docid); } } if (Random.Next(30) == 17) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": apply " + toDeleteIDs.Count + " deletes"); } foreach (string id in toDeleteIDs) { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": del term=id:" + id); } outerInstance.DeleteDocuments(new Term("docid", id)); } int count = outerInstance.m_delCount.AddAndGet(toDeleteIDs.Count); if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": tot " + count + " deletes"); } delIDs.UnionWith(toDeleteIDs); toDeleteIDs.Clear(); foreach (SubDocs subDocs in toDeleteSubDocs) { if (Debugging.AssertsEnabled) { Debugging.Assert(!subDocs.Deleted); } delPackIDs.Add(subDocs.PackID); outerInstance.DeleteDocuments(new Term("packID", subDocs.PackID)); subDocs.Deleted = true; if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": del subs: " + subDocs.SubIDs + " packID=" + subDocs.PackID); } delIDs.UnionWith(subDocs.SubIDs); outerInstance.m_delCount.AddAndGet(subDocs.SubIDs.Count); } toDeleteSubDocs.Clear(); } if (addedField != null) { doc.RemoveField(addedField); } } catch (Exception t) when(t.IsThrowable()) { Console.WriteLine(Thread.CurrentThread.Name + ": hit exc"); Console.WriteLine(t.ToString()); Console.Write(t.StackTrace); outerInstance.m_failed.Value = (true); throw RuntimeException.Create(t); } } if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": indexing done"); } outerInstance.DoAfterIndexingThreadDone(); }
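// Illustrative sketch: the doc-block ("pack") pattern the indexing thread above relies on.
// AddDocuments writes a group of documents as one contiguous block; UpdateDocuments
// replaces every document matching the pack term with a new block in a single atomic
// operation. "packID" mirrors the field used by the test; the helper name is hypothetical.
private static void ReplacePack(IndexWriter writer, string packID, IList<Document> block)
{
    Term packTerm = new Term("packID", packID);
    foreach (Document d in block)
    {
        d.RemoveFields("packID");
        d.Add(new StringField("packID", packID, Field.Store.YES));
    }
    writer.UpdateDocuments(packTerm, block); // delete old block (if any) + add new block atomically
}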
public virtual void IndexDoc() { Document d = new Document(); FieldType customType1 = new FieldType(TextField.TYPE_STORED); customType1.IsTokenized = false; customType1.OmitNorms = true; List <Field> fields = new List <Field>(); string idString = IdString; Field idField = NewField("id", idString, customType1); fields.Add(idField); int nFields = NextInt(MaxFields); for (int i = 0; i < nFields; i++) { FieldType customType = new FieldType(); switch (NextInt(4)) { case 0: break; case 1: customType.StoreTermVectors = true; break; case 2: customType.StoreTermVectors = true; customType.StoreTermVectorPositions = true; break; case 3: customType.StoreTermVectors = true; customType.StoreTermVectorOffsets = true; break; } switch (NextInt(4)) { case 0: customType.IsStored = true; customType.OmitNorms = true; customType.IsIndexed = true; fields.Add(NewField("f" + NextInt(100), GetString(1), customType)); break; case 1: customType.IsIndexed = true; customType.IsTokenized = true; fields.Add(NewField("f" + NextInt(100), GetString(0), customType)); break; case 2: customType.IsStored = true; customType.StoreTermVectors = false; customType.StoreTermVectorOffsets = false; customType.StoreTermVectorPositions = false; fields.Add(NewField("f" + NextInt(100), GetString(0), customType)); break; case 3: customType.IsStored = true; customType.IsIndexed = true; customType.IsTokenized = true; fields.Add(NewField("f" + NextInt(100), GetString(BigFieldSize), customType)); break; } } if (SameFieldOrder) { fields.Sort(fieldNameComparer); } else { // random placement of id field also fields.Swap(NextInt(fields.Count), 0); } for (int i = 0; i < fields.Count; i++) { d.Add(fields[i]); } if (VERBOSE) { Console.WriteLine(Thread.CurrentThread.Name + ": indexing id:" + idString); } w.UpdateDocument(new Term("id", idString), d); //System.out.println(Thread.currentThread().getName() + ": indexing "+d); Docs[idString] = d; }
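// Illustrative sketch: configuring and freezing one concrete FieldType like the randomized
// ones assembled above. Freeze() makes the type immutable so it can safely be shared across
// documents and threads; the field name and text are hypothetical.
FieldType withVectors = new FieldType(TextField.TYPE_STORED);
withVectors.StoreTermVectors = true;
withVectors.StoreTermVectorPositions = true;
withVectors.StoreTermVectorOffsets = true;
withVectors.Freeze();
Document d = new Document();
d.Add(new Field("body", "stored, analyzed text with term vectors", withVectors));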
public virtual void VerifyEquals(DirectoryReader r1, DirectoryReader r2, string idField) { if (VERBOSE) { Console.WriteLine("\nr1 docs:"); PrintDocs(r1); Console.WriteLine("\nr2 docs:"); PrintDocs(r2); } if (r1.NumDocs != r2.NumDocs) { Debug.Assert(false, "r1.NumDocs=" + r1.NumDocs + " vs r2.NumDocs=" + r2.NumDocs); } bool hasDeletes = !(r1.MaxDoc == r2.MaxDoc && r1.NumDocs == r1.MaxDoc); int[] r2r1 = new int[r2.MaxDoc]; // r2 id to r1 id mapping // create mapping from id2 space to id2 based on idField Fields f1 = MultiFields.GetFields(r1); if (f1 == null) { // make sure r2 is empty Assert.IsNull(MultiFields.GetFields(r2)); return; } Terms terms1 = f1.GetTerms(idField); if (terms1 == null) { Assert.IsTrue(MultiFields.GetFields(r2) == null || MultiFields.GetFields(r2).GetTerms(idField) == null); return; } TermsEnum termsEnum = terms1.GetIterator(null); IBits liveDocs1 = MultiFields.GetLiveDocs(r1); IBits liveDocs2 = MultiFields.GetLiveDocs(r2); Fields fields = MultiFields.GetFields(r2); if (fields == null) { // make sure r1 is in fact empty (eg has only all // deleted docs): IBits liveDocs = MultiFields.GetLiveDocs(r1); DocsEnum docs = null; while (termsEnum.Next() != null) { docs = TestUtil.Docs(Random, termsEnum, liveDocs, docs, DocsFlags.NONE); while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { Assert.Fail("r1 is not empty but r2 is"); } } return; } Terms terms2 = fields.GetTerms(idField); TermsEnum termsEnum2 = terms2.GetIterator(null); DocsEnum termDocs1 = null; DocsEnum termDocs2 = null; while (true) { BytesRef term = termsEnum.Next(); //System.out.println("TEST: match id term=" + term); if (term == null) { break; } termDocs1 = TestUtil.Docs(Random, termsEnum, liveDocs1, termDocs1, DocsFlags.NONE); if (termsEnum2.SeekExact(term)) { termDocs2 = TestUtil.Docs(Random, termsEnum2, liveDocs2, termDocs2, DocsFlags.NONE); } else { termDocs2 = null; } if (termDocs1.NextDoc() == DocIdSetIterator.NO_MORE_DOCS) { // this doc is deleted and wasn't replaced Assert.IsTrue(termDocs2 == null || termDocs2.NextDoc() == DocIdSetIterator.NO_MORE_DOCS); continue; } int id1 = termDocs1.DocID; Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, termDocs1.NextDoc()); Assert.IsTrue(termDocs2.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); int id2 = termDocs2.DocID; Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, termDocs2.NextDoc()); r2r1[id2] = id1; // verify stored fields are equivalent try { VerifyEquals(r1.Document(id1), r2.Document(id2)); } catch (Exception /*t*/) { Console.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term); Console.WriteLine(" d1=" + r1.Document(id1)); Console.WriteLine(" d2=" + r2.Document(id2)); throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details) } try { // verify term vectors are equivalent VerifyEquals(r1.GetTermVectors(id1), r2.GetTermVectors(id2)); } catch (Exception /*e*/) { Console.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2); Fields tv1 = r1.GetTermVectors(id1); Console.WriteLine(" d1=" + tv1); if (tv1 != null) { DocsAndPositionsEnum dpEnum = null; DocsEnum dEnum = null; foreach (string field in tv1) { Console.WriteLine(" " + field + ":"); Terms terms3 = tv1.GetTerms(field); Assert.IsNotNull(terms3); TermsEnum termsEnum3 = terms3.GetIterator(null); BytesRef term2; while ((term2 = termsEnum3.Next()) != null) { Console.WriteLine(" " + term2.Utf8ToString() + ": freq=" + termsEnum3.TotalTermFreq); dpEnum = 
termsEnum3.DocsAndPositions(null, dpEnum); if (dpEnum != null) { Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); int freq = dpEnum.Freq; Console.WriteLine(" doc=" + dpEnum.DocID + " freq=" + freq); for (int posUpto = 0; posUpto < freq; posUpto++) { Console.WriteLine(" pos=" + dpEnum.NextPosition()); } } else { dEnum = TestUtil.Docs(Random, termsEnum3, null, dEnum, DocsFlags.FREQS); Assert.IsNotNull(dEnum); Assert.IsTrue(dEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); int freq = dEnum.Freq; Console.WriteLine(" doc=" + dEnum.DocID + " freq=" + freq); } } } } Fields tv2 = r2.GetTermVectors(id2); Console.WriteLine(" d2=" + tv2); if (tv2 != null) { DocsAndPositionsEnum dpEnum = null; DocsEnum dEnum = null; foreach (string field in tv2) { Console.WriteLine(" " + field + ":"); Terms terms3 = tv2.GetTerms(field); Assert.IsNotNull(terms3); TermsEnum termsEnum3 = terms3.GetIterator(null); BytesRef term2; while ((term2 = termsEnum3.Next()) != null) { Console.WriteLine(" " + term2.Utf8ToString() + ": freq=" + termsEnum3.TotalTermFreq); dpEnum = termsEnum3.DocsAndPositions(null, dpEnum); if (dpEnum != null) { Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); int freq = dpEnum.Freq; Console.WriteLine(" doc=" + dpEnum.DocID + " freq=" + freq); for (int posUpto = 0; posUpto < freq; posUpto++) { Console.WriteLine(" pos=" + dpEnum.NextPosition()); } } else { dEnum = TestUtil.Docs(Random, termsEnum3, null, dEnum, DocsFlags.FREQS); Assert.IsNotNull(dEnum); Assert.IsTrue(dEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); int freq = dEnum.Freq; Console.WriteLine(" doc=" + dEnum.DocID + " freq=" + freq); } } } } throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details) } } //System.out.println("TEST: done match id"); // Verify postings //System.out.println("TEST: create te1"); Fields fields1 = MultiFields.GetFields(r1); IEnumerator <string> fields1Enum = fields1.GetEnumerator(); Fields fields2 = MultiFields.GetFields(r2); IEnumerator <string> fields2Enum = fields2.GetEnumerator(); string field1 = null, field2 = null; TermsEnum termsEnum1 = null; termsEnum2 = null; DocsEnum docs1 = null, docs2 = null; // pack both doc and freq into single element for easy sorting long[] info1 = new long[r1.NumDocs]; long[] info2 = new long[r2.NumDocs]; for (; ;) { BytesRef term1 = null, term2 = null; // iterate until we get some docs int len1; for (; ;) { len1 = 0; if (termsEnum1 == null) { if (!fields1Enum.MoveNext()) { break; } field1 = fields1Enum.Current; Terms terms = fields1.GetTerms(field1); if (terms == null) { continue; } termsEnum1 = terms.GetIterator(null); } term1 = termsEnum1.Next(); if (term1 == null) { // no more terms in this field termsEnum1 = null; continue; } //System.out.println("TEST: term1=" + term1); docs1 = TestUtil.Docs(Random, termsEnum1, liveDocs1, docs1, DocsFlags.FREQS); while (docs1.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { int d = docs1.DocID; int f = docs1.Freq; info1[len1] = (((long)d) << 32) | (uint)f; len1++; } if (len1 > 0) { break; } } // iterate until we get some docs int len2; for (; ;) { len2 = 0; if (termsEnum2 == null) { if (!fields2Enum.MoveNext()) { break; } field2 = fields2Enum.Current; Terms terms = fields2.GetTerms(field2); if (terms == null) { continue; } termsEnum2 = terms.GetIterator(null); } term2 = termsEnum2.Next(); if (term2 == null) { // no more terms in this field termsEnum2 = null; continue; } //System.out.println("TEST: term1=" 
+ term1); docs2 = TestUtil.Docs(Random, termsEnum2, liveDocs2, docs2, DocsFlags.FREQS); while (docs2.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { int d = r2r1[docs2.DocID]; int f = docs2.Freq; info2[len2] = (((long)d) << 32) | (uint)f; len2++; } if (len2 > 0) { break; } } Assert.AreEqual(len1, len2); if (len1 == 0) // no more terms { break; } Assert.AreEqual(field1, field2); Assert.IsTrue(term1.BytesEquals(term2)); if (!hasDeletes) { Assert.AreEqual(termsEnum1.DocFreq, termsEnum2.DocFreq); } Assert.AreEqual(term1, term2, "len1=" + len1 + " len2=" + len2 + " deletes?=" + hasDeletes); // sort info2 to get it into ascending docid Array.Sort(info2, 0, len2); // now compare for (int i = 0; i < len1; i++) { Assert.AreEqual(info1[i], info2[i], "i=" + i + " len=" + len1 + " d1=" + ((long)((ulong)info1[i] >> 32)) + " f1=" + (info1[i] & int.MaxValue) + " d2=" + ((long)((ulong)info2[i] >> 32)) + " f2=" + (info2[i] & int.MaxValue) + " field=" + field1 + " term=" + term1.Utf8ToString()); } } }
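// Illustrative sketch: the doc/freq packing trick used when comparing postings above.
// Putting the doc id in the high 32 bits and the frequency in the low 32 bits lets a
// single Array.Sort order postings by doc id while keeping both values together; the
// unpacking below mirrors the masks the assertions use.
private static long Pack(int doc, int freq)
{
    return (((long)doc) << 32) | (uint)freq;
}
private static int UnpackDoc(long packed)
{
    return (int)((ulong)packed >> 32);
}
private static int UnpackFreq(long packed)
{
    return (int)(packed & int.MaxValue);
}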
public override void BeforeClass() { base.BeforeClass(); ANALYZER = new MockAnalyzer(Random); qp = new StandardQueryParser(ANALYZER); IDictionary <string, /*Number*/ object> randomNumberMap = new JCG.Dictionary <string, object>(); /*SimpleDateFormat*/ string dateFormat; long randomDate; bool dateFormatSanityCheckPass; int count = 0; do { if (count > 100) { fail("This test has problems to find a sane random DateFormat/NumberFormat. Stopped trying after 100 iterations."); } dateFormatSanityCheckPass = true; LOCALE = RandomCulture(Random); TIMEZONE = RandomTimeZone(Random); DATE_STYLE = randomDateStyle(Random); TIME_STYLE = randomDateStyle(Random); //// assumes localized date pattern will have at least year, month, day, //// hour, minute //dateFormat = (SimpleDateFormat)DateFormat.getDateTimeInstance( // DATE_STYLE, TIME_STYLE, LOCALE); //// not all date patterns includes era, full year, timezone and second, //// so we add them here //dateFormat.applyPattern(dateFormat.toPattern() + " G s Z yyyy"); //dateFormat.setTimeZone(TIMEZONE); // assumes localized date pattern will have at least year, month, day, // hour, minute DATE_FORMAT = new NumberDateFormat(DATE_STYLE, TIME_STYLE, LOCALE) { TimeZone = TIMEZONE }; // not all date patterns includes era, full year, timezone and second, // so we add them here DATE_FORMAT.SetDateFormat(DATE_FORMAT.GetDateFormat() + " g s z yyyy"); dateFormat = DATE_FORMAT.GetDateFormat(); do { randomDate = Random.nextLong(); // prune date value so it doesn't pass in insane values to some // calendars. randomDate = randomDate % 3400000000000L; // truncate to second randomDate = (randomDate / 1000L) * 1000L; // only positive values randomDate = Math.Abs(randomDate); } while (randomDate == 0L); dateFormatSanityCheckPass &= checkDateFormatSanity(dateFormat, randomDate); dateFormatSanityCheckPass &= checkDateFormatSanity(dateFormat, 0); dateFormatSanityCheckPass &= checkDateFormatSanity(dateFormat, -randomDate); count++; } while (!dateFormatSanityCheckPass); //NUMBER_FORMAT = NumberFormat.getNumberInstance(LOCALE); //NUMBER_FORMAT.setMaximumFractionDigits((Random().nextInt() & 20) + 1); //NUMBER_FORMAT.setMinimumFractionDigits((Random().nextInt() & 20) + 1); //NUMBER_FORMAT.setMaximumIntegerDigits((Random().nextInt() & 20) + 1); //NUMBER_FORMAT.setMinimumIntegerDigits((Random().nextInt() & 20) + 1); NUMBER_FORMAT = new NumberFormat(LOCALE); double randomDouble; long randomLong; int randomInt; float randomFloat; while ((randomLong = Convert.ToInt64(NormalizeNumber(Math.Abs(Random.nextLong())) )) == 0L) { ; } while ((randomDouble = Convert.ToDouble(NormalizeNumber(Math.Abs(Random.NextDouble())) )) == 0.0) { ; } while ((randomFloat = Convert.ToSingle(NormalizeNumber(Math.Abs(Random.nextFloat())) )) == 0.0f) { ; } while ((randomInt = Convert.ToInt32(NormalizeNumber(Math.Abs(Random.nextInt())))) == 0) { ; } randomNumberMap.Put(NumericType.INT64.ToString(), randomLong); randomNumberMap.Put(NumericType.INT32.ToString(), randomInt); randomNumberMap.Put(NumericType.SINGLE.ToString(), randomFloat); randomNumberMap.Put(NumericType.DOUBLE.ToString(), randomDouble); randomNumberMap.Put(DATE_FIELD_NAME, randomDate); RANDOM_NUMBER_MAP = randomNumberMap.AsReadOnly(); directory = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter(Random, directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)) .SetMaxBufferedDocs(TestUtil.NextInt32(Random, 50, 1000)) .SetMergePolicy(NewLogMergePolicy())); Document doc = new Document(); IDictionary <String, 
NumericConfig> numericConfigMap = new JCG.Dictionary <String, NumericConfig>(); IDictionary <String, Field> numericFieldMap = new JCG.Dictionary <String, Field>(); qp.NumericConfigMap = numericConfigMap; foreach (NumericType type in Enum.GetValues(typeof(NumericType))) { if (type == NumericType.NONE) { continue; } numericConfigMap.Put(type.ToString(), new NumericConfig(PRECISION_STEP, NUMBER_FORMAT, type)); FieldType ft2 = new FieldType(Int32Field.TYPE_NOT_STORED); ft2.NumericType = type; ft2.IsStored = true; ft2.NumericPrecisionStep = PRECISION_STEP; ft2.Freeze(); Field field; switch (type) { case NumericType.INT32: field = new Int32Field(type.ToString(), 0, ft2); break; case NumericType.SINGLE: field = new SingleField(type.ToString(), 0.0f, ft2); break; case NumericType.INT64: field = new Int64Field(type.ToString(), 0L, ft2); break; case NumericType.DOUBLE: field = new DoubleField(type.ToString(), 0.0, ft2); break; default: fail(); field = null; break; } numericFieldMap.Put(type.ToString(), field); doc.Add(field); } numericConfigMap.Put(DATE_FIELD_NAME, new NumericConfig(PRECISION_STEP, DATE_FORMAT, NumericType.INT64)); FieldType ft = new FieldType(Int64Field.TYPE_NOT_STORED); ft.IsStored = true; ft.NumericPrecisionStep = PRECISION_STEP; Int64Field dateField = new Int64Field(DATE_FIELD_NAME, 0L, ft); numericFieldMap.Put(DATE_FIELD_NAME, dateField); doc.Add(dateField); foreach (NumberType numberType in Enum.GetValues(typeof(NumberType))) { setFieldValues(numberType, numericFieldMap); if (VERBOSE) { Console.WriteLine("Indexing document: " + doc); } writer.AddDocument(doc); } reader = writer.GetReader(); searcher = NewSearcher(reader); writer.Dispose(); }
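// A minimal sketch (separate from the test; the pattern and culture below are arbitrary examples) of the
// sanity-check idea used above: format a timestamp with a candidate culture, parse it back, and only
// accept the culture/format combination when the round trip preserves the value.
using System;
using System.Globalization;

internal static class DateRoundTripDemo
{
    private static bool RoundTrips(DateTime value, string pattern, CultureInfo culture)
    {
        string text = value.ToString(pattern, culture);
        return DateTime.TryParseExact(text, pattern, culture, DateTimeStyles.None, out DateTime parsed)
               && parsed == value;
    }

    private static void Main()
    {
        var culture = new CultureInfo("de-DE");
        var value = new DateTime(2011, 7, 4, 13, 45, 12); // whole seconds only, like the truncated random date
        Console.WriteLine(RoundTrips(value, "yyyy-MM-dd HH:mm:ss", culture)); // True
    }
}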
private void DoTestSeekDoesNotExist(Random r, int numField, IList <Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader) { IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>(); if (Verbose) { Console.WriteLine("TEST: top random seeks"); } { int num = AtLeast(100); for (int iter = 0; iter < num; iter++) { // seek to random spot string field = ("f" + r.Next(numField)).Intern(); Term tx = new Term(field, GetRandomString(r)); int spot = Array.BinarySearch(fieldTermsArray, tx); if (spot < 0) { if (Verbose) { Console.WriteLine("TEST: non-exist seek to " + field + ":" + UnicodeUtil.ToHexString(tx.Text())); } // term does not exist: TermsEnum te; if (!tes.TryGetValue(field, out te)) { te = MultiFields.GetTerms(reader, field).GetIterator(null); tes[field] = te; } if (Verbose) { Console.WriteLine(" got enum"); } spot = -spot - 1; if (spot == fieldTerms.Count || !fieldTerms[spot].Field.Equals(field, StringComparison.Ordinal)) { Assert.AreEqual(TermsEnum.SeekStatus.END, te.SeekCeil(tx.Bytes)); } else { Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, te.SeekCeil(tx.Bytes)); if (Verbose) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(te.Term.Utf8ToString())); Console.WriteLine(" exp term=" + UnicodeUtil.ToHexString(fieldTerms[spot].Text())); } Assert.AreEqual(fieldTerms[spot].Bytes, te.Term); // now .next() this many times: int ct = TestUtil.NextInt32(r, 5, 100); for (int i = 0; i < ct; i++) { if (Verbose) { Console.WriteLine("TEST: now next()"); } if (1 + spot + i >= fieldTerms.Count) { break; } Term term = fieldTerms[1 + spot + i]; if (!term.Field.Equals(field, StringComparison.Ordinal)) { Assert.IsNull(te.Next()); break; } else { BytesRef t = te.Next(); if (Verbose) { Console.WriteLine(" got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString()))); Console.WriteLine(" exp=" + UnicodeUtil.ToHexString(term.Text().ToString())); } Assert.AreEqual(term.Bytes, t); } } } } } } }
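// A minimal sketch (independent of the test; the array contents are made up) of the Array.BinarySearch
// convention relied on above: a negative return value is the bitwise complement of the insertion point,
// so -spot - 1 (equivalently ~spot) is the index where the missing key would be inserted.
using System;

internal static class BinarySearchInsertionPointDemo
{
    private static void Main()
    {
        string[] terms = { "apple", "banana", "mango" };
        int spot = Array.BinarySearch(terms, "cherry", StringComparer.Ordinal);
        if (spot < 0)
        {
            int insertionPoint = -spot - 1; // same as ~spot
            Console.WriteLine($"not found; would insert at index {insertionPoint}"); // prints 2
        }
    }
}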
public override void BeforeClass() { base.BeforeClass(); IEnumerable <Type> analysisClasses = typeof(StandardAnalyzer).Assembly.GetTypes() .Where(c => { var typeInfo = c; return(!typeInfo.IsAbstract && typeInfo.IsPublic && !typeInfo.IsInterface && typeInfo.IsClass && (typeInfo.GetCustomAttribute <ObsoleteAttribute>() == null) && (typeInfo.IsSubclassOf(typeof(Tokenizer)) || typeInfo.IsSubclassOf(typeof(TokenFilter)) || typeInfo.IsSubclassOf(typeof(CharFilter)))); }) .ToArray(); tokenizers = new List <ConstructorInfo>(); tokenfilters = new List <ConstructorInfo>(); charfilters = new List <ConstructorInfo>(); foreach (Type c in analysisClasses) { foreach (ConstructorInfo ctor in c.GetConstructors()) { if (ctor.GetCustomAttribute <ObsoleteAttribute>() != null || (brokenConstructors.ContainsKey(ctor) && brokenConstructors[ctor] == ALWAYS)) { continue; } var typeInfo = c; if (typeInfo.IsSubclassOf(typeof(Tokenizer))) { assertTrue(ctor.ToString() + " has unsupported parameter types", allowedTokenizerArgs.containsAll(ctor.GetParameters().Select(p => p.ParameterType).ToArray())); tokenizers.Add(ctor); } else if (typeInfo.IsSubclassOf(typeof(TokenFilter))) { assertTrue(ctor.ToString() + " has unsupported parameter types", allowedTokenFilterArgs.containsAll(ctor.GetParameters().Select(p => p.ParameterType).ToArray())); tokenfilters.Add(ctor); } else if (typeInfo.IsSubclassOf(typeof(CharFilter))) { assertTrue(ctor.ToString() + " has unsupported parameter types", allowedCharFilterArgs.containsAll(ctor.GetParameters().Select(p => p.ParameterType).ToArray())); charfilters.Add(ctor); } else { fail("Cannot get here"); } } } IComparer <ConstructorInfo> ctorComp = Comparer <ConstructorInfo> .Create((arg0, arg1) => arg0.ToString().CompareToOrdinal(arg1.ToString())); tokenizers.Sort(ctorComp); tokenfilters.Sort(ctorComp); charfilters.Sort(ctorComp); if (Verbose) { Console.WriteLine("tokenizers = " + tokenizers); Console.WriteLine("tokenfilters = " + tokenfilters); Console.WriteLine("charfilters = " + charfilters); } }
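// A minimal sketch of the reflection scan above, using System.IO.TextReader as a stand-in base type
// (the real test walks the Tokenizer/TokenFilter/CharFilter hierarchy): find concrete public subclasses
// in an assembly and list each non-obsolete public constructor's parameter types.
using System;
using System.Linq;
using System.Reflection;

internal static class ConstructorScanDemo
{
    private static void Main()
    {
        Type baseType = typeof(System.IO.TextReader); // stand-in for the analysis base classes
        var ctors = baseType.Assembly.GetTypes()
            .Where(t => t.IsClass && !t.IsAbstract && t.IsPublic && t.IsSubclassOf(baseType))
            .SelectMany(t => t.GetConstructors())
            .Where(c => c.GetCustomAttribute<ObsoleteAttribute>() == null);

        foreach (ConstructorInfo ctor in ctors)
        {
            string args = string.Join(", ", ctor.GetParameters().Select(p => p.ParameterType.Name));
            Console.WriteLine($"{ctor.DeclaringType.Name}({args})");
        }
    }
}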
public virtual void TestSurrogatesOrder() { Directory dir = NewDirectory(); var config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)); config.Codec = new PreFlexRWCodec(); RandomIndexWriter w = new RandomIndexWriter(Random, dir, config); int numField = TestUtil.NextInt32(Random, 2, 5); int uniqueTermCount = 0; int tc = 0; var fieldTerms = new List <Term>(); for (int f = 0; f < numField; f++) { string field = "f" + f; int numTerms = AtLeast(200); ISet <string> uniqueTerms = new JCG.HashSet <string>(); for (int i = 0; i < numTerms; i++) { string term = GetRandomString(Random) + "_ " + (tc++); uniqueTerms.Add(term); fieldTerms.Add(new Term(field, term)); Documents.Document doc = new Documents.Document(); doc.Add(NewStringField(field, term, Field.Store.NO)); w.AddDocument(doc); } uniqueTermCount += uniqueTerms.Count; } IndexReader reader = w.GetReader(); if (Verbose) { fieldTerms.Sort(termAsUTF16Comparer); Console.WriteLine("\nTEST: UTF16 order"); foreach (Term t in fieldTerms) { Console.WriteLine(" " + ToHexString(t)); } } // sorts in code point order: fieldTerms.Sort(); if (Verbose) { Console.WriteLine("\nTEST: codepoint order"); foreach (Term t in fieldTerms) { Console.WriteLine(" " + ToHexString(t)); } } Term[] fieldTermsArray = fieldTerms.ToArray(); //SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); //FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1)); //Assert.IsNotNull(fields); DoTestStraightEnum(fieldTerms, reader, uniqueTermCount); DoTestSeekExists(Random, fieldTerms, reader); DoTestSeekDoesNotExist(Random, numField, fieldTerms, fieldTermsArray, reader); reader.Dispose(); w.Dispose(); dir.Dispose(); }
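// A minimal sketch (independent of the test; the two sample characters are arbitrary) of why UTF-16 order
// and Unicode code point order disagree for supplementary characters, which is what TestSurrogatesOrder
// exercises: surrogate code units (0xD800-0xDFFF) compare below 0xE000-0xFFFF in ordinal (UTF-16)
// comparisons even though the code points they encode are larger.
using System;

internal static class SurrogateOrderDemo
{
    private static void Main()
    {
        string bmp = "\uFFFC";               // U+FFFC, a high BMP code point (single char)
        string supplementary = "\U0001D11E"; // U+1D11E, stored as the surrogate pair D834 DD1E

        int utf16Order = string.CompareOrdinal(supplementary, bmp);   // negative: 0xD834 < 0xFFFC
        int codePointOrder = char.ConvertToUtf32(supplementary, 0)
                                 .CompareTo(char.ConvertToUtf32(bmp, 0)); // positive: 0x1D11E > 0xFFFC
        Console.WriteLine($"UTF-16 order: {utf16Order}, code point order: {codePointOrder}");
    }
}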
private void ExecuteRandomJoin(bool multipleValuesPerDocument, int maxIndexIter, int maxSearchIter, int numberOfDocumentsToIndex) { for (int indexIter = 1; indexIter <= maxIndexIter; indexIter++) { if (VERBOSE) { Console.WriteLine("indexIter=" + indexIter); } Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.KEYWORD, false)) .SetMergePolicy(NewLogMergePolicy())); bool scoreDocsInOrder = TestJoinUtil.Random.NextBoolean(); IndexIterationContext context = CreateContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument, scoreDocsInOrder); IndexReader topLevelReader = w.GetReader(); w.Dispose(); for (int searchIter = 1; searchIter <= maxSearchIter; searchIter++) { if (VERBOSE) { Console.WriteLine("searchIter=" + searchIter); } IndexSearcher indexSearcher = NewSearcher(topLevelReader); int r = Random.Next(context.RandomUniqueValues.Length); bool from = context.RandomFrom[r]; string randomValue = context.RandomUniqueValues[r]; FixedBitSet expectedResult = CreateExpectedResult(randomValue, from, indexSearcher.IndexReader, context); Query actualQuery = new TermQuery(new Term("value", randomValue)); if (VERBOSE) { Console.WriteLine("actualQuery=" + actualQuery); } var scoreModeLength = Enum.GetNames(typeof(ScoreMode)).Length; ScoreMode scoreMode = (ScoreMode)Random.Next(scoreModeLength); if (VERBOSE) { Console.WriteLine("scoreMode=" + scoreMode); } Query joinQuery; if (from) { joinQuery = JoinUtil.CreateJoinQuery("from", multipleValuesPerDocument, "to", actualQuery, indexSearcher, scoreMode); } else { joinQuery = JoinUtil.CreateJoinQuery("to", multipleValuesPerDocument, "from", actualQuery, indexSearcher, scoreMode); } if (VERBOSE) { Console.WriteLine("joinQuery=" + joinQuery); } // Need to know all documents that have matches. TopDocs doesn't give me that and then I'd be also testing TopDocsCollector... FixedBitSet actualResult = new FixedBitSet(indexSearcher.IndexReader.MaxDoc); TopScoreDocCollector topScoreDocCollector = TopScoreDocCollector.Create(10, false); indexSearcher.Search(joinQuery, new CollectorAnonymousInnerClassHelper2(this, scoreDocsInOrder, context, actualResult, topScoreDocCollector)); // Asserting bit set... if (VERBOSE) { Console.WriteLine("expected cardinality:" + expectedResult.Cardinality()); DocIdSetIterator iterator = expectedResult.GetIterator(); for (int doc = iterator.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.NextDoc()) { Console.WriteLine(string.Format("Expected doc[{0}] with id value {1}", doc, indexSearcher.Doc(doc).Get("id"))); } Console.WriteLine("actual cardinality:" + actualResult.Cardinality()); iterator = actualResult.GetIterator(); for (int doc = iterator.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.NextDoc()) { Console.WriteLine(string.Format("Actual doc[{0}] with id value {1}", doc, indexSearcher.Doc(doc).Get("id"))); } } assertEquals(expectedResult, actualResult); // Asserting TopDocs... 
TopDocs expectedTopDocs = CreateExpectedTopDocs(randomValue, from, scoreMode, context); TopDocs actualTopDocs = topScoreDocCollector.GetTopDocs(); assertEquals(expectedTopDocs.TotalHits, actualTopDocs.TotalHits); assertEquals(expectedTopDocs.ScoreDocs.Length, actualTopDocs.ScoreDocs.Length); if (scoreMode == ScoreMode.None) { continue; } assertEquals(expectedTopDocs.MaxScore, actualTopDocs.MaxScore, 0.0f); for (int i = 0; i < expectedTopDocs.ScoreDocs.Length; i++) { if (VERBOSE) { Console.Write(string.Format("Expected doc: {0} | Actual doc: {1}\n", expectedTopDocs.ScoreDocs[i].Doc, actualTopDocs.ScoreDocs[i].Doc)); Console.Write(string.Format("Expected score: {0} | Actual score: {1}\n", expectedTopDocs.ScoreDocs[i].Score, actualTopDocs.ScoreDocs[i].Score)); } assertEquals(expectedTopDocs.ScoreDocs[i].Doc, actualTopDocs.ScoreDocs[i].Doc); assertEquals(expectedTopDocs.ScoreDocs[i].Score, actualTopDocs.ScoreDocs[i].Score, 0.0f); Explanation explanation = indexSearcher.Explain(joinQuery, expectedTopDocs.ScoreDocs[i].Doc); assertEquals(expectedTopDocs.ScoreDocs[i].Score, explanation.Value, 0.0f); } } topLevelReader.Dispose(); dir.Dispose(); } }
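// A minimal sketch (independent of the test; the enum here is made up) of the random-enum-member trick
// used above: cast a bounded random integer to the enum type, where the bound is the number of named
// members. This only works because the underlying values are the contiguous range 0..memberCount-1.
using System;

internal static class RandomEnumDemo
{
    private enum Color { Red, Green, Blue } // contiguous values 0..2, like ScoreMode in the test

    private static void Main()
    {
        var random = new Random();
        int memberCount = Enum.GetNames(typeof(Color)).Length;
        Color picked = (Color)random.Next(memberCount);
        Console.WriteLine(picked);
    }
}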
public virtual void Test() { // LUCENENET specific - disable the test if not 64 bit AssumeTrue("This test consumes too much RAM to be run on x86.", Constants.RUNTIME_IS_64BIT); MockDirectoryWrapper dir = new MockDirectoryWrapper(Random, new MMapDirectory(CreateTempDir("4GBStoredFields"))); dir.Throttling = Throttling.NEVER; var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)) .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .SetRAMBufferSizeMB(256.0) .SetMergeScheduler(new ConcurrentMergeScheduler()) .SetMergePolicy(NewLogMergePolicy(false, 10)) .SetOpenMode(OpenMode.CREATE); IndexWriter w = new IndexWriter(dir, config); MergePolicy mp = w.Config.MergePolicy; if (mp is LogByteSizeMergePolicy) { // 1 petabyte: ((LogByteSizeMergePolicy)mp).MaxMergeMB = 1024 * 1024 * 1024; } Document doc = new Document(); FieldType ft = new FieldType(); ft.IsIndexed = false; ft.IsStored = true; ft.Freeze(); int valueLength = RandomInts.RandomInt32Between(Random, 1 << 13, 1 << 20); var value = new byte[valueLength]; for (int i = 0; i < valueLength; ++i) { // random so that even compressing codecs can't compress it value[i] = (byte)Random.Next(256); } Field f = new Field("fld", value, ft); doc.Add(f); int numDocs = (int)((1L << 32) / valueLength + 100); for (int i = 0; i < numDocs; ++i) { w.AddDocument(doc); if (Verbose && i % (numDocs / 10) == 0) { Console.WriteLine(i + " of " + numDocs + "..."); } } w.ForceMerge(1); w.Dispose(); if (Verbose) { bool found = false; foreach (string file in dir.ListAll()) { if (file.EndsWith(".fdt", StringComparison.Ordinal)) { long fileLength = dir.FileLength(file); if (fileLength >= 1L << 32) { found = true; } Console.WriteLine("File length of " + file + " : " + fileLength); } } if (!found) { Console.WriteLine("No .fdt file larger than 4GB, test bug?"); } } DirectoryReader rd = DirectoryReader.Open(dir); Document sd = rd.Document(numDocs - 1); Assert.IsNotNull(sd); Assert.AreEqual(1, sd.Fields.Count); BytesRef valueRef = sd.GetBinaryValue("fld"); Assert.IsNotNull(valueRef); Assert.AreEqual(new BytesRef(value), valueRef); rd.Dispose(); dir.Dispose(); }
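// A rough sketch (the valueLength below is a fixed example; the test randomizes it) of the document-count
// arithmetic above: index enough documents that the stored bytes alone exceed 2^32, which is what pushes
// the .fdt file past the 4 GB boundary.
using System;

internal static class FourGBStoredFieldsMath
{
    private static void Main()
    {
        int valueLength = 1 << 16; // 64 KB of stored bytes per document (example value)
        int numDocs = (int)((1L << 32) / valueLength + 100);
        long storedBytes = (long)numDocs * valueLength; // exceeds 4 GB by roughly 100 * valueLength
        Console.WriteLine($"numDocs={numDocs:N0} storedBytes={storedBytes:N0} limit={(1L << 32):N0}");
    }
}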
// randomly seeks to term that we know exists, then next's // from there private void DoTestSeekExists(Random r, IList <Term> fieldTerms, IndexReader reader) { IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>(); // Test random seek to existing term, then enum: if (Verbose) { Console.WriteLine("\nTEST: top now seek"); } int num = AtLeast(100); for (int iter = 0; iter < num; iter++) { // pick random field+term int spot = r.Next(fieldTerms.Count); Term term = fieldTerms[spot]; string field = term.Field; if (Verbose) { Console.WriteLine("TEST: exist seek field=" + field + " term=" + UnicodeUtil.ToHexString(term.Text())); } // seek to it if (!tes.TryGetValue(field, out TermsEnum te)) { te = MultiFields.GetTerms(reader, field).GetEnumerator(); tes[field] = te; } if (Verbose) { Console.WriteLine(" done get enum"); } // seek should find the term Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(term.Bytes)); // now .next() this many times: int ct = TestUtil.NextInt32(r, 5, 100); for (int i = 0; i < ct; i++) { if (Verbose) { Console.WriteLine("TEST: now next()"); } if (1 + spot + i >= fieldTerms.Count) { break; } term = fieldTerms[1 + spot + i]; if (!term.Field.Equals(field, StringComparison.Ordinal)) { Assert.IsFalse(te.MoveNext()); break; } else { Assert.IsTrue(te.MoveNext()); BytesRef t = te.Term; if (Verbose) { Console.WriteLine(" got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString()))); Console.WriteLine(" exp=" + UnicodeUtil.ToHexString(term.Text().ToString())); } Assert.AreEqual(term.Bytes, t); } } } }
public async Task TestConcurrentAccess() { assertEquals(1, searchers.Count); using IndexReader r = DirectoryReader.Open(userindex); spellChecker.ClearIndex(); assertEquals(2, searchers.Count); Addwords(r, spellChecker, "field1"); assertEquals(3, searchers.Count); int num_field1 = this.NumDoc(); Addwords(r, spellChecker, "field2"); assertEquals(4, searchers.Count); int num_field2 = this.NumDoc(); assertEquals(num_field2, num_field1 + 1); int numThreads = 5 + Random.nextInt(5); var tasks = new ConcurrentBag <Task>(); SpellCheckWorker[] workers = new SpellCheckWorker[numThreads]; var executor = new LimitedConcurrencyLevelTaskScheduler(numThreads); // LUCENENET NOTE: Not sure why in Java they decided to pass the max concurrent threads as all of the threads, but this demonstrates how to use a custom TaskScheduler in .NET. using var shutdown = new CancellationTokenSource(); var cancellationToken = shutdown.Token; var stop = new AtomicBoolean(false); var taskFactory = new TaskFactory(executor); for (int i = 0; i < numThreads; i++) { SpellCheckWorker spellCheckWorker = new SpellCheckWorker(this, r, stop, cancellationToken, taskNum: i); workers[i] = spellCheckWorker; tasks.Add(taskFactory.StartNew(() => spellCheckWorker.Run(), cancellationToken)); } int iterations = 5 + Random.nextInt(5); for (int i = 0; i < iterations; i++) { Thread.Sleep(100); // concurrently reset the spell index spellChecker.SetSpellIndex(this.spellindex); // for debug - prints the internal open searchers // showSearchersOpen(); } stop.Value = true; executor.Shutdown(); // Stop allowing tasks to queue try { // wait for 60 seconds - usually this is very fast but coverage runs could take quite long shutdown.CancelAfter(TimeSpan.FromSeconds(60)); await Task.WhenAll(tasks.ToArray()); } catch (OperationCanceledException) { if (Verbose) { Console.WriteLine($"\n{nameof(OperationCanceledException)} thrown\n(safe shutdown after timeout)"); } } finally { shutdown.Dispose(); spellChecker.Dispose(); // In Lucene, this was the line that did "stop" and the running task responded to the AlreadyClosedException to break out of the loop, but we are using AtomicBoolean to signal instead. } for (int i = 0; i < workers.Length; i++) { assertFalse(string.Format(CultureInfo.InvariantCulture, "worker thread {0} failed \n" + workers[i].Error, i), workers[i].Error != null); assertTrue(string.Format(CultureInfo.InvariantCulture, "worker thread {0} is still running but should be terminated", i), workers[i].terminated); } // 4 searchers more than iterations // 1. at creation // 2. clearIndex() // 3. and 4. during addwords assertEquals(iterations + 4, searchers.Count); AssertSearchersClosed(); }
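// A minimal sketch (independent of the test, which uses its own LimitedConcurrencyLevelTaskScheduler) of
// funneling tasks through a TaskScheduler with a bounded concurrency level; ConcurrentExclusiveSchedulerPair
// is a built-in scheduler that gives a similar effect to the custom scheduler used above.
using System;
using System.Linq;
using System.Threading.Tasks;

internal static class BoundedSchedulerDemo
{
    private static async Task Main()
    {
        var pair = new ConcurrentExclusiveSchedulerPair(TaskScheduler.Default, maxConcurrencyLevel: 4);
        var factory = new TaskFactory(pair.ConcurrentScheduler);

        Task[] tasks = Enumerable.Range(0, 10)
            .Select(i => factory.StartNew(() => Console.WriteLine($"task {i} on bounded scheduler")))
            .ToArray();

        await Task.WhenAll(tasks);
        pair.Complete();       // stop accepting new work
        await pair.Completion; // wait for the scheduler to drain
    }
}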