private void CreateRandomIndexes(int maxSegments)
{
    dir = NewDirectory();
    numDocs = AtLeast(150);
    int numTerms = TestUtil.NextInt32(Random, 1, numDocs / 5);
    ISet<string> randomTerms = new JCG.HashSet<string>();
    while (randomTerms.Count < numTerms)
    {
        randomTerms.Add(TestUtil.RandomSimpleString(Random));
    }
    terms = new JCG.List<string>(randomTerms);
    long seed = Random.NextInt64();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new J2N.Randomizer(seed)));
    iwc.SetMergePolicy(TestSortingMergePolicy.NewSortingMergePolicy(sort));
    iw = new RandomIndexWriter(new J2N.Randomizer(seed), dir, iwc);
    for (int i = 0; i < numDocs; ++i)
    {
        Document doc = RandomDocument();
        iw.AddDocument(doc);
        if (i == numDocs / 2 || (i != numDocs - 1 && Random.Next(8) == 0))
        {
            iw.Commit();
        }
        if (Random.Next(15) == 0)
        {
            string term = RandomPicks.RandomFrom(Random, terms);
            iw.DeleteDocuments(new Term("s", term));
        }
    }
    reader = iw.GetReader();
}
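// A recurring idiom in these examples: draw random values until a set reaches
// the target size, letting the set swallow collisions. A minimal, self-contained
// sketch of that pattern (the helper name and the 1-7 character length range are
// illustrative, not taken from any of the tests here):
private static ISet<string> DrawUniqueTerms(Random random, int count)
{
    ISet<string> terms = new JCG.HashSet<string>();
    while (terms.Count < count)
    {
        int len = random.Next(1, 8); // 1..7 chars
        char[] chars = new char[len];
        for (int i = 0; i < len; i++)
        {
            chars[i] = (char)random.Next('a', 'z' + 1);
        }
        terms.Add(new string(chars)); // duplicate strings are silently ignored
    }
    return terms;
}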
private void TestRandomWords(int maxNumWords, int numIter)
{
    Random random = new Random(Random.Next());
    for (int iter = 0; iter < numIter; iter++)
    {
        if (Verbose)
        {
            Console.WriteLine("\nTEST: iter " + iter);
        }
        for (int inputMode = 0; inputMode < 2; inputMode++)
        {
            int numWords = random.Next(maxNumWords + 1);
            ISet<Int32sRef> termsSet = new JCG.HashSet<Int32sRef>();
            while (termsSet.Count < numWords)
            {
                string term = FSTTester<object>.GetRandomString(random);
                termsSet.Add(FSTTester<object>.ToInt32sRef(term, inputMode));
            }
            DoTest(inputMode, termsSet.ToArray());
        }
    }
}
public void TestWithContexts()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
    KeyValuePair<List<string>, IDictionary<string, Document>> res = GenerateIndexDocuments(AtLeast(1000), true, true);
    IDictionary<string, Document> docs = res.Value;
    List<string> invalidDocTerms = res.Key;
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();
    IndexReader ir = DirectoryReader.Open(dir);
    IDictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME, CONTEXT_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        string field = f.Utf8ToString();
        Document doc = docs[field];
        docs.Remove(field);
        assertTrue(f.Equals(new BytesRef(doc.Get(FIELD_NAME))));
        IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
        assertEquals(inputIterator.Weight, (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0);
        assertTrue(inputIterator.Payload.Equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
        ISet<BytesRef> oriCtxs = new JCG.HashSet<BytesRef>();
        IEnumerable<BytesRef> contextSet = inputIterator.Contexts;
        foreach (IIndexableField ctxf in doc.GetFields(CONTEXT_FIELD_NAME))
        {
            oriCtxs.Add(ctxf.GetBinaryValue());
        }
        assertEquals(oriCtxs.Count, contextSet.Count());
    }
    foreach (string invalidTerm in invalidDocTerms)
    {
        var invalid = docs[invalidTerm];
        docs.Remove(invalidTerm);
        assertNotNull(invalid);
    }
    assertTrue(!docs.Any());
    ir.Dispose();
    dir.Dispose();
}
public void TestMultiThreaded()
{
    FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
    PerfRunData runData = createPerfRunData(file, false, typeof(ThreadingDocMaker).AssemblyQualifiedName);
    ThreadJob[] threads = new ThreadJob[10];
    using (WriteLineDocTask wldt = new WriteLineDocTask(runData))
    {
        for (int i = 0; i < threads.Length; i++)
        {
            threads[i] = new ThreadAnonymousHelper("t" + i, wldt);
        }
        foreach (ThreadJob t in threads)
        {
            t.Start();
        }
        foreach (ThreadJob t in threads)
        {
            t.Join();
        }
    } // wldt is disposed here

    ISet<string> ids = new JCG.HashSet<string>();
    TextReader br = new StreamReader(new FileStream(file.FullName, FileMode.Open, FileAccess.Read, FileShare.None), Encoding.UTF8);
    try
    {
        string line = br.ReadLine();
        assertHeaderLine(line); // header line is written once, no matter how many threads there are
        for (int i = 0; i < threads.Length; i++)
        {
            line = br.ReadLine();
            assertNotNull($"line for index {i} is missing", line); // LUCENENET specific - ensure the line is there before splitting
            string[] parts = line.Split(WriteLineDocTask.SEP).TrimEnd();
            assertEquals(line, 3, parts.Length);
            // check that all thread names written are the same in the same line
            string tname = parts[0].Substring(parts[0].IndexOf('_'));
            ids.Add(tname);
            assertEquals(tname, parts[1].Substring(parts[1].IndexOf('_')));
            assertEquals(tname, parts[2].Substring(parts[2].IndexOf('_')));
        }
        // only threads.Length lines should exist
        assertNull(br.ReadLine());
        assertEquals(threads.Length, ids.Count);
    }
    finally
    {
        br.Dispose();
    }
}
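// Hedged sketch of the per-line assertion above: WriteLineDocTask emits one
// SEP-separated line per document (three fields in this configuration), and
// ThreadingDocMaker stamps each field with the writing thread's name, so every
// part of a line must name the same thread. The helper below is illustrative,
// not part of the test:
private static string ExtractThreadName(string line, char sep)
{
    string[] parts = line.Split(sep);
    string tname = parts[0].Substring(parts[0].IndexOf('_'));
    for (int i = 1; i < parts.Length; i++)
    {
        if (parts[i].Substring(parts[i].IndexOf('_')) != tname)
        {
            throw new InvalidOperationException("fields written by different threads: " + line);
        }
    }
    return tname;
}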
/// <summary>
/// Makes a bunch of single-char tokens (at most 26 unique terms).
/// Puts the number of unique terms into <c>expected</c>, to be checked against the norm.
/// </summary>
private string AddValue()
{
    StringBuilder sb = new StringBuilder();
    ISet<string> terms = new JCG.HashSet<string>();
    int num = TestUtil.NextInt32(Random, 0, 255);
    for (int i = 0; i < num; i++)
    {
        sb.Append(' ');
        char term = (char)TestUtil.NextInt32(Random, 'a', 'z');
        sb.Append(term);
        terms.Add(term.ToString());
    }
    expected.Add(terms.Count);
    return sb.ToString();
}
public void TestFastFilter()
{
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.ProcessingMode = ProcessingMode.PM_FAST_INVALIDATION;
    ISet<string> results = new JCG.HashSet<string>();
    ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.Length > 0);
    foreach (ScoreDoc hit in hits)
    {
        Document d = searcher.Doc(hit.Doc);
        string url = d.Get(KEY_FIELD);
        assertFalse("No duplicate urls should be returned", results.Contains(url));
        results.Add(url);
    }
    assertEquals("Two urls found", 2, results.Count);
}
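// Side note on the set usage above: ISet<T>.Add returns false when the element
// is already present, so the Contains/Add pair can collapse into a single call.
// A self-contained sketch of that variant (the helper name is illustrative):
private static bool AllUnique(IEnumerable<string> urls)
{
    ISet<string> seen = new JCG.HashSet<string>();
    foreach (string url in urls)
    {
        if (!seen.Add(url))
        {
            return false; // Add reports whether the element was newly added
        }
    }
    return true;
}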
public void TestRandom()
{
    string[] terms = new string[TestUtil.NextInt32(Random, 2, 10)];
    ISet<string> seen = new JCG.HashSet<string>();
    while (seen.Count < terms.Length)
    {
        string token = TestUtil.RandomSimpleString(Random, 1, 5);
        if (!seen.Contains(token))
        {
            terms[seen.Count] = token;
            seen.Add(token);
        }
    }

    Analyzer a = new MockAnalyzer(Random);

    int numDocs = AtLeast(10);
    long totTokens = 0;
    string[][] docs = new string[numDocs][];
    for (int i = 0; i < numDocs; i++)
    {
        docs[i] = new string[AtLeast(100)];
        if (Verbose)
        {
            Console.Write(" doc " + i + ":");
        }
        for (int j = 0; j < docs[i].Length; j++)
        {
            docs[i][j] = GetZipfToken(terms);
            if (Verbose)
            {
                Console.Write(" " + docs[i][j]);
            }
        }
        if (Verbose)
        {
            Console.WriteLine();
        }
        totTokens += docs[i].Length;
    }

    int grams = TestUtil.NextInt32(Random, 1, 4);

    if (Verbose)
    {
        Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
    }

    // Build suggester model:
    FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);
    sug.Build(new TestRandomInputIterator(this, docs));

    // Build inefficient but hopefully correct model:
    List<IDictionary<string, int?>> gramCounts = new List<IDictionary<string, int?>>(grams);
    for (int gram = 0; gram < grams; gram++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: build model for gram=" + gram);
        }
        IDictionary<string, int?> model = new JCG.Dictionary<string, int?>();
        gramCounts.Add(model);
        foreach (string[] doc in docs)
        {
            for (int i = 0; i < doc.Length - gram; i++)
            {
                StringBuilder b = new StringBuilder();
                for (int j = i; j <= i + gram; j++)
                {
                    if (j > i)
                    {
                        b.Append(' ');
                    }
                    b.Append(doc[j]);
                }
                string token = b.ToString();
                if (!model.TryGetValue(token, out int? curCount) || curCount == null)
                {
                    model[token] = 1;
                }
                else
                {
                    model[token] = 1 + curCount;
                }
                if (Verbose)
                {
                    Console.WriteLine(" add '" + token + "' -> count=" + (model.TryGetValue(token, out int? count) ? (count.HasValue ? count.ToString() : "null") : ""));
                }
            }
        }
    }

    int lookups = AtLeast(100);
    for (int iter = 0; iter < lookups; iter++)
    {
        string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
        for (int i = 0; i < tokens.Length; i++)
        {
            tokens[i] = GetZipfToken(terms);
        }

        // Maybe trim last token; be sure not to create the empty string:
        int trimStart;
        if (tokens.Length == 1)
        {
            trimStart = 1;
        }
        else
        {
            trimStart = 0;
        }
        int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[tokens.Length - 1].Length);
        tokens[tokens.Length - 1] = tokens[tokens.Length - 1].Substring(0, trimAt);

        int num = TestUtil.NextInt32(Random, 1, 100);
        StringBuilder b = new StringBuilder();
        foreach (string token in tokens)
        {
            b.Append(' ');
            b.Append(token);
        }
        string query = b.ToString();
        query = query.Substring(1);

        if (Verbose)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
        }

        // Expected:
        List<Lookup.LookupResult> expected = new List<Lookup.LookupResult>();
        double backoff = 1.0;
        seen = new JCG.HashSet<string>();

        if (Verbose)
        {
            Console.WriteLine(" compute expected");
        }
        for (int i = grams - 1; i >= 0; i--)
        {
            if (Verbose)
            {
                Console.WriteLine(" grams=" + i);
            }

            if (tokens.Length < i + 1)
            {
                // Don't have enough tokens to use this model
                if (Verbose)
                {
                    Console.WriteLine(" skip");
                }
                continue;
            }

            if (i == 0 && tokens[tokens.Length - 1].Length == 0)
            {
                // Never suggest unigrams from empty string:
                if (Verbose)
                {
                    Console.WriteLine(" skip unigram priors only");
                }
                continue;
            }

            // Build up "context" ngram:
            b = new StringBuilder();
            for (int j = tokens.Length - i - 1; j < tokens.Length - 1; j++)
            {
                b.Append(' ');
                b.Append(tokens[j]);
            }
            string context = b.ToString();
            if (context.Length > 0)
            {
                context = context.Substring(1);
            }
            if (Verbose)
            {
                Console.WriteLine(" context='" + context + "'");
            }
            long contextCount;
            if (context.Length == 0)
            {
                contextCount = totTokens;
            }
            else
            {
                var gramCount = gramCounts[i - 1];
                if (!gramCount.TryGetValue(context, out int? count) || count == null)
                {
                    // We never saw this context:
                    backoff *= FreeTextSuggester.ALPHA;
                    if (Verbose)
                    {
                        Console.WriteLine(" skip: never saw context");
                    }
                    continue;
                }
                contextCount = count.GetValueOrDefault();
            }
            if (Verbose)
            {
                Console.WriteLine(" contextCount=" + contextCount);
            }
            IDictionary<string, int?> model = gramCounts[i];

            // First pass, gather all predictions for this model:
            if (Verbose)
            {
                Console.WriteLine(" find terms w/ prefix=" + tokens[tokens.Length - 1]);
            }
            List<Lookup.LookupResult> tmp = new List<Lookup.LookupResult>();
            foreach (string term in terms)
            {
                if (term.StartsWith(tokens[tokens.Length - 1], StringComparison.Ordinal))
                {
                    if (Verbose)
                    {
                        Console.WriteLine(" term=" + term);
                    }
                    if (seen.Contains(term))
                    {
                        if (Verbose)
                        {
                            Console.WriteLine(" skip seen");
                        }
                        continue;
                    }
                    string ngram = (context + " " + term).Trim();
                    if (model.TryGetValue(ngram, out int? count) && count != null)
                    {
                        // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                        // return numbers that are greater than long.MaxValue, which results in a negative long number.
                        // This is also the way it is being done in the FreeTextSuggester to work around the issue.
                        Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount)));
                        tmp.Add(lr);
                        if (Verbose)
                        {
                            Console.WriteLine(" add tmp key='" + lr.Key + "' score=" + lr.Value);
                        }
                    }
                }
            }

            // Second pass, trim to only top N, and fold those into overall suggestions:
            tmp.Sort(byScoreThenKey);
            if (tmp.Count > num)
            {
                tmp.RemoveRange(num, tmp.Count - num);
            }
            foreach (Lookup.LookupResult result in tmp)
            {
                string key = result.Key.ToString();
                int idx = key.LastIndexOf(' ');
                string lastToken;
                if (idx != -1)
                {
                    lastToken = key.Substring(idx + 1);
                }
                else
                {
                    lastToken = key;
                }
                if (!seen.Contains(lastToken))
                {
                    seen.Add(lastToken);
                    expected.Add(result);
                    if (Verbose)
                    {
                        Console.WriteLine(" keep key='" + result.Key + "' score=" + result.Value);
                    }
                }
            }

            backoff *= FreeTextSuggester.ALPHA;
        }

        expected.Sort(byScoreThenKey);
        if (expected.Count > num)
        {
            expected.RemoveRange(num, expected.Count - num);
        }

        // Actual:
        IList<Lookup.LookupResult> actual = sug.DoLookup(query, num);

        if (Verbose)
        {
            Console.WriteLine(" expected: " + expected);
            Console.WriteLine(" actual: " + actual);
        }

        assertEquals(expected.ToString(), actual.ToString());
    }
}
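// The expected-score computation above is "stupid backoff" scoring: each time a
// higher-order ngram model cannot be used, the weight is multiplied by ALPHA
// before falling back to the next-lower-order model. A hedged sketch of the
// formula as this test reproduces it, where decimal arithmetic avoids the double
// rounding past long.MaxValue mentioned in the note above (the helper and its
// parameters are illustrative; alpha stands in for FreeTextSuggester.ALPHA):
private static long StupidBackoffScore(long count, long contextCount, int modelsSkipped, decimal alpha)
{
    decimal backoff = 1m;
    for (int i = 0; i < modelsSkipped; i++)
    {
        backoff *= alpha;
    }
    // count/contextCount <= 1 and backoff <= 1, so the result fits in a long
    return (long)(long.MaxValue * (backoff * count / contextCount));
}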
private void CreateRandomIndexes()
{
    dir1 = NewDirectory();
    dir2 = NewDirectory();
    int numDocs = AtLeast(150);
    int numTerms = TestUtil.NextInt32(Random, 1, numDocs / 5);
    ISet<string> randomTerms = new JCG.HashSet<string>();
    while (randomTerms.Count < numTerms)
    {
        randomTerms.Add(TestUtil.RandomSimpleString(Random));
    }
    terms = new JCG.List<string>(randomTerms);
    long seed = Random.NextInt64();
    IndexWriterConfig iwc1 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new J2N.Randomizer(seed)));
    IndexWriterConfig iwc2 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new J2N.Randomizer(seed)));
    iwc2.SetMergePolicy(NewSortingMergePolicy(sort));
    RandomIndexWriter iw1 = new RandomIndexWriter(new J2N.Randomizer(seed), dir1, iwc1);
    RandomIndexWriter iw2 = new RandomIndexWriter(new J2N.Randomizer(seed), dir2, iwc2);
    for (int i = 0; i < numDocs; ++i)
    {
        if (Random.Next(5) == 0 && i != numDocs - 1)
        {
            string term = RandomPicks.RandomFrom(Random, terms);
            iw1.DeleteDocuments(new Term("s", term));
            iw2.DeleteDocuments(new Term("s", term));
        }
        Document doc = randomDocument();
        iw1.AddDocument(doc);
        iw2.AddDocument(doc);
        if (Random.Next(8) == 0)
        {
            iw1.Commit();
            iw2.Commit();
        }
    }
    // Make sure we have something to merge
    iw1.Commit();
    iw2.Commit();
    Document doc2 = randomDocument();
    // NOTE: don't use RIW.AddDocument directly, since it sometimes commits,
    // which may trigger a merge, in which case ForceMerge may not do anything.
    // With field updates this is a problem, since the updates can go into the
    // single segment in the index, and therefore the index won't be sorted.
    // This hurts the assumption of the test later on, that the index is sorted
    // by SortingMP.
    iw1.IndexWriter.AddDocument(doc2);
    iw2.IndexWriter.AddDocument(doc2);

    if (DefaultCodecSupportsFieldUpdates)
    {
        // update NDV of docs belonging to one term (covers many documents)
        long value = Random.NextInt64();
        string term = RandomPicks.RandomFrom(Random, terms);
        iw1.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value);
        iw2.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value);
    }

    iw1.ForceMerge(1);
    iw2.ForceMerge(1);
    iw1.Dispose();
    iw2.Dispose();
    reader = DirectoryReader.Open(dir1);
    sortedReader = DirectoryReader.Open(dir2);
}
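// The mirrored-writer setup above hinges on deterministic randomness: both
// MockAnalyzers and both RandomIndexWriters are seeded from the same 64-bit
// value, so the two indexes receive identical token streams, deletes, and
// commit points and differ only in merge policy. A minimal illustration of
// that property (J2N.Randomizer is a System.Random subclass with a long seed,
// as already used above; the helper itself is illustrative):
private static void DemoSharedSeed()
{
    long sharedSeed = DateTime.UtcNow.Ticks;
    Random r1 = new J2N.Randomizer(sharedSeed);
    Random r2 = new J2N.Randomizer(sharedSeed);
    for (int i = 0; i < 5; i++)
    {
        System.Diagnostics.Debug.Assert(r1.Next() == r2.Next()); // identical sequences
    }
}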