private void CreateRandomIndexes(int maxSegments)
{
    dir = NewDirectory();
    numDocs = AtLeast(150);
    int numTerms = TestUtil.NextInt32(Random, 1, numDocs / 5);
    ISet<string> randomTerms = new JCG.HashSet<string>();
    while (randomTerms.size() < numTerms)
    {
        randomTerms.add(TestUtil.RandomSimpleString(Random));
    }
    terms = new JCG.List<string>(randomTerms);
    long seed = Random.NextInt64();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new J2N.Randomizer(seed)));
    iwc.SetMergePolicy(TestSortingMergePolicy.NewSortingMergePolicy(sort));
    iw = new RandomIndexWriter(new J2N.Randomizer(seed), dir, iwc);
    for (int i = 0; i < numDocs; ++i)
    {
        Document doc = RandomDocument();
        iw.AddDocument(doc);
        if (i == numDocs / 2 || (i != numDocs - 1 && Random.nextInt(8) == 0))
        {
            iw.Commit();
        }
        if (Random.nextInt(15) == 0)
        {
            string term = RandomPicks.RandomFrom(Random, terms);
            iw.DeleteDocuments(new Term("s", term));
        }
    }
    reader = iw.GetReader();
}
public virtual void runTestQuery(SpatialMatchConcern concern, SpatialTestQuery q)
{
    String msg = q.toString(); //"Query: " + q.args.toString(ctx);
    SearchResults got = executeQuery(makeQuery(q), Math.Max(100, q.ids.size() + 1));
    if (storeShape && got.numFound > 0)
    {
        // check that the stored value is there
        assertNotNull(got.results[0].document.Get(strategy.FieldName));
    }
    if (concern.orderIsImportant)
    {
        IEnumerator<String> ids = q.ids.GetEnumerator();
        foreach (SearchResult r in got.results)
        {
            String id = r.document.Get("id");
            if (!ids.MoveNext())
            {
                fail(msg + " :: Did not get enough results. Expected " + q.ids + ", got: " + got.toDebugString());
            }
            assertEquals("out of order: " + msg, ids.Current, id);
        }
        if (ids.MoveNext())
        {
            fail(msg + " :: expected more results than we got: " + ids.Current);
        }
    }
    else
    {
        // We are looking at how the results overlap
        if (concern.resultsAreSuperset)
        {
            ISet<string> found = new JCG.HashSet<string>();
            foreach (SearchResult r in got.results)
            {
                found.add(r.document.Get("id"));
            }
            foreach (String s in q.ids)
            {
                if (!found.contains(s))
                {
                    fail("Results are missing id: " + s + " :: " + found);
                }
            }
        }
        else
        {
            List<string> found = new List<string>();
            foreach (SearchResult r in got.results)
            {
                found.Add(r.document.Get("id"));
            }
            // sort both so that the order is not important
            CollectionUtil.TimSort(q.ids);
            CollectionUtil.TimSort(found);
            assertEquals(msg, q.ids.toString(), found.toString());
        }
    }
}
public void TestWithContexts()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
    KeyValuePair<List<string>, IDictionary<string, Document>> res = GenerateIndexDocuments(AtLeast(1000), true, true);
    IDictionary<string, Document> docs = res.Value;
    List<string> invalidDocTerms = res.Key;
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();
    IndexReader ir = DirectoryReader.Open(dir);
    IDictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME, CONTEXT_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        string field = f.Utf8ToString();
        Document doc = docs[field];
        docs.Remove(field); //Document doc = docs.remove(f.utf8ToString());
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
        assertEquals(inputIterator.Weight, (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0);
        assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
        ISet<BytesRef> oriCtxs = new JCG.HashSet<BytesRef>();
        IEnumerable<BytesRef> contextSet = inputIterator.Contexts;
        foreach (IIndexableField ctxf in doc.GetFields(CONTEXT_FIELD_NAME))
        {
            oriCtxs.add(ctxf.GetBinaryValue());
        }
        assertEquals(oriCtxs.size(), contextSet.Count());
    }
    foreach (string invalidTerm in invalidDocTerms)
    {
        var invalid = docs[invalidTerm];
        docs.Remove(invalidTerm);
        assertNotNull(invalid);
    }
    assertTrue(!docs.Any());
    ir.Dispose();
    dir.Dispose();
}
public void TestMultiThreaded()
{
    FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
    PerfRunData runData = createPerfRunData(file, false, typeof(ThreadingDocMaker).AssemblyQualifiedName);
    ThreadJob[] threads = new ThreadJob[10];
    using (WriteLineDocTask wldt = new WriteLineDocTask(runData))
    {
        for (int i = 0; i < threads.Length; i++)
        {
            threads[i] = new ThreadAnonymousHelper("t" + i, wldt);
        }
        foreach (ThreadJob t in threads)
        {
            t.Start();
        }
        foreach (ThreadJob t in threads)
        {
            t.Join();
        }
    } // wldt.Dispose() is handled by the using block

    ISet<String> ids = new JCG.HashSet<string>();
    TextReader br = new StreamReader(new FileStream(file.FullName, FileMode.Open, FileAccess.Read, FileShare.None), Encoding.UTF8);
    try
    {
        String line = br.ReadLine();
        assertHeaderLine(line); // header line is written once, no matter how many threads there are
        for (int i = 0; i < threads.Length; i++)
        {
            line = br.ReadLine();
            assertNotNull($"line for index {i} is missing", line); // LUCENENET specific - ensure the line is there before splitting
            String[] parts = line.Split(WriteLineDocTask.SEP).TrimEnd();
            assertEquals(line, 3, parts.Length);
            // check that all thread names written are the same in the same line
            String tname = parts[0].Substring(parts[0].IndexOf('_'));
            ids.add(tname);
            assertEquals(tname, parts[1].Substring(parts[1].IndexOf('_')));
            assertEquals(tname, parts[2].Substring(parts[2].IndexOf('_')));
        }
        // only threads.length lines should exist
        assertNull(br.ReadLine());
        assertEquals(threads.Length, ids.size());
    }
    finally
    {
        br.Dispose();
    }
}
public void TestWithContext()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
    IDictionary<string, Document> docs = GenerateIndexDocuments(AtLeast(100));
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();
    IndexReader ir = DirectoryReader.Open(dir);
    ValueSource[] toAdd = new ValueSource[]
    {
        new Int64FieldSource(WEIGHT_FIELD_NAME_1),
        new Int64FieldSource(WEIGHT_FIELD_NAME_2),
        new Int64FieldSource(WEIGHT_FIELD_NAME_3)
    };
    IDictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd), PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
    IInputEnumerator inputEnumerator = dictionary.GetEntryEnumerator();
    while (inputEnumerator.MoveNext())
    {
        string field = inputEnumerator.Current.Utf8ToString();
        Document doc = docs[field];
        docs.Remove(field);
        long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
        long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
        long w3 = doc.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();
        assertTrue(inputEnumerator.Current.equals(new BytesRef(doc.Get(FIELD_NAME))));
        assertEquals(inputEnumerator.Weight, (w1 + w2 + w3));
        assertTrue(inputEnumerator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));

        // LUCENENET NOTE: This test was once failing because we used SCG.HashSet<T>, whose
        // Equals() implementation does not check for set equality. As a result, SortedInputEnumerator
        // had been modified to reverse the results to get the test to pass. However, using JCG.HashSet<T>
        // ensures that set equality (that is, equality that doesn't care about the order of items)
        // is respected. SortedInputEnumerator has also had the specific sorting removed.
        ISet<BytesRef> originalCtxs = new JCG.HashSet<BytesRef>();
        foreach (IIndexableField ctxf in doc.GetFields(CONTEXTS_FIELD_NAME))
        {
            originalCtxs.add(ctxf.GetBinaryValue());
        }
        assertEquals(originalCtxs, inputEnumerator.Contexts);
    }
    assertTrue(docs.Count == 0);
    ir.Dispose();
    dir.Dispose();
}
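// A minimal sketch (illustrative only, not part of the test) of the set-equality
// difference the LUCENENET NOTE above relies on: J2N's JCG.HashSet<T> implements
// structural, order-insensitive equality, while SCG's HashSet<T> does not override
// Equals() and therefore compares by reference.
//
//   var jcgA = new JCG.HashSet<string> { "a", "b" };
//   var jcgB = new JCG.HashSet<string> { "b", "a" };
//   bool setEqual = jcgA.Equals(jcgB);   // true: contents compared as a set
//
//   var scgA = new System.Collections.Generic.HashSet<string> { "a", "b" };
//   var scgB = new System.Collections.Generic.HashSet<string> { "b", "a" };
//   bool refEqual = scgA.Equals(scgB);   // false: reference equality only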
public void TestDefaultFilter()
{
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    ISet<string> results = new JCG.HashSet<string>();
    ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
    foreach (ScoreDoc hit in hits)
    {
        Document d = searcher.Doc(hit.Doc);
        string url = d.Get(KEY_FIELD);
        assertFalse("No duplicate urls should be returned", results.contains(url));
        results.add(url);
    }
}
/**
 * Makes a bunch of single-char tokens (the max # of unique terms will be at most 26).
 * Puts the # of unique terms into expected, to be checked against the norm.
 */
private string AddValue()
{
    StringBuilder sb = new StringBuilder();
    ISet<string> terms = new JCG.HashSet<string>();
    int num = TestUtil.NextInt32(Random, 0, 255);
    for (int i = 0; i < num; i++)
    {
        sb.append(' ');
        char term = (char)TestUtil.NextInt32(Random, 'a', 'z');
        sb.append(term);
        terms.add("" + term);
    }
    expected.Add(terms.size());
    return sb.toString();
}
protected virtual void AssertOperation(IDictionary<String, IShape> indexedDocs, SpatialOperation operation, IShape queryShape)
{
    // Generate truth via brute force
    ISet<string> expectedIds = new JCG.HashSet<string>();
    foreach (var stringShapeEntry in indexedDocs)
    {
        if (operation.Evaluate(stringShapeEntry.Value, queryShape))
        {
            expectedIds.add(stringShapeEntry.Key);
        }
    }

    SpatialTestQuery testQuery = new SpatialTestQuery();
    testQuery.args = new SpatialArgs(operation, queryShape);
    testQuery.ids = new List<string>(expectedIds);
    runTestQuery(SpatialMatchConcern.FILTER, testQuery);
}
/**
 * Reads a set of queries from a resource file.
 */
private ISet<string> ReadQueries(string resource)
{
    ISet<string> queries = new JCG.HashSet<string>();
    using (Stream stream = GetType().getResourceAsStream(resource))
    using (TextReader reader = new StreamReader(stream, Encoding.UTF8))
    {
        String line = null;
        while ((line = reader.ReadLine()) != null)
        {
            line = line.Trim();
            // skip blank lines and '#' / '//' comment lines
            if (line.Length > 0 && !line.StartsWith("#", StringComparison.Ordinal) && !line.StartsWith("//", StringComparison.Ordinal))
            {
                queries.add(line);
            }
        }
    }
    return queries;
}
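// A minimal illustration (hypothetical file contents, not an actual test resource):
// ReadQueries skips blank lines and lines starting with '#' or '//', so a resource
// containing the following five lines yields exactly two queries.
//
//   # example query file
//   // crafted by hand
//   "der" AND "die"
//
//   katze OR hund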
public void TestWithContext()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
    IDictionary<string, Document> docs = GenerateIndexDocuments(AtLeast(100));
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();
    IndexReader ir = DirectoryReader.Open(dir);
    ValueSource[] toAdd = new ValueSource[]
    {
        new Int64FieldSource(WEIGHT_FIELD_NAME_1),
        new Int64FieldSource(WEIGHT_FIELD_NAME_2),
        new Int64FieldSource(WEIGHT_FIELD_NAME_3)
    };
    IDictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd), PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        string field = f.Utf8ToString();
        Document doc = docs[field];
        docs.Remove(field);
        long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
        long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
        long w3 = doc.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        assertEquals(inputIterator.Weight, (w1 + w2 + w3));
        assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
        ISet<BytesRef> originalCtxs = new JCG.HashSet<BytesRef>();
        foreach (IIndexableField ctxf in doc.GetFields(CONTEXTS_FIELD_NAME))
        {
            originalCtxs.add(ctxf.GetBinaryValue());
        }
        assertEquals(originalCtxs, inputIterator.Contexts);
    }
    assertTrue(docs.Count == 0);
    ir.Dispose();
    dir.Dispose();
}
private void checkHits(SpatialArgs args, int assertNumFound, int[] assertIds)
{
    SearchResults got = executeQuery(strategy.MakeQuery(args), 100);
    assertEquals("" + args, assertNumFound, got.numFound);
    if (assertIds != null)
    {
        ISet<int?> gotIds = new JCG.HashSet<int?>();
        foreach (SearchResult result in got.results)
        {
            gotIds.add(int.Parse(result.document.Get("id"), CultureInfo.InvariantCulture));
        }
        foreach (int assertId in assertIds)
        {
            assertTrue("has " + assertId, gotIds.contains(assertId));
        }
    }
}
public void TestFastFilter()
{
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.ProcessingMode = ProcessingMode.PM_FAST_INVALIDATION;
    ISet<string> results = new JCG.HashSet<string>();
    ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.Length > 0);
    foreach (ScoreDoc hit in hits)
    {
        Document d = searcher.Doc(hit.Doc);
        string url = d.Get(KEY_FIELD);
        assertFalse("No duplicate urls should be returned", results.contains(url));
        results.add(url);
    }
    assertEquals("Two urls found", 2, results.size());
}
public void TestNoFilter()
{
    ISet<string> results = new JCG.HashSet<string>();
    ScoreDoc[] hits = searcher.Search(tq, null, 1000).ScoreDocs;
    assertTrue("Default searching should have found some matches", hits.Length > 0);
    bool dupsFound = false;
    foreach (ScoreDoc hit in hits)
    {
        Document d = searcher.Doc(hit.Doc);
        string url = d.Get(KEY_FIELD);
        if (!dupsFound)
        {
            dupsFound = results.contains(url);
        }
        results.add(url);
    }
    assertTrue("Default searching should have found duplicate urls", dupsFound);
}
public object Create(Random random)
{
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    // we can't add duplicate keys, or NormalizeCharMap gets angry
    ISet<string> keys = new JCG.HashSet<string>();
    int num = random.nextInt(5);
    //System.out.println("NormalizeCharMap=");
    for (int i = 0; i < num; i++)
    {
        string key = TestUtil.RandomSimpleString(random);
        if (!keys.contains(key) && key.Length > 0)
        {
            string value = TestUtil.RandomSimpleString(random);
            builder.Add(key, value);
            keys.add(key);
            //System.out.println("mapping: '" + key + "' => '" + value + "'");
        }
    }
    return builder.Build();
}
private void _CheckHits(bool bbox, IPoint pt, double distKM, int assertNumFound, params int[] assertIds)
{
    SpatialOperation op = SpatialOperation.Intersects;
    double distDEG = DistanceUtils.Dist2Degrees(distKM, DistanceUtils.EARTH_MEAN_RADIUS_KM);
    IShape shape = ctx.MakeCircle(pt, distDEG);
    if (bbox)
    {
        shape = shape.BoundingBox;
    }

    SpatialArgs args = new SpatialArgs(op, shape);
    //args.setDistPrecision(0.025);
    Query query;
    if (Random.nextBoolean())
    {
        query = strategy.MakeQuery(args);
    }
    else
    {
        query = new FilteredQuery(new MatchAllDocsQuery(), strategy.MakeFilter(args));
    }

    SearchResults results = executeQuery(query, 100);
    assertEquals("" + shape, assertNumFound, results.numFound);
    if (assertIds != null)
    {
        ISet<int?> resultIds = new JCG.HashSet<int?>();
        foreach (SearchResult result in results.results)
        {
            resultIds.add(int.Parse(result.document.Get("id"), CultureInfo.InvariantCulture));
        }
        foreach (int assertId in assertIds)
        {
            assertTrue("has " + assertId, resultIds.contains(assertId));
        }
    }
}
private GroupedFacetResult CreateExpectedFacetResult(string searchTerm, IndexContext context, int offset, int limit, int minCount, bool orderByCount, string facetPrefix)
{
    JCG.Dictionary<string, ISet<string>> facetGroups;
    if (!context.searchTermToFacetGroups.TryGetValue(searchTerm, out facetGroups))
    {
        facetGroups = new JCG.Dictionary<string, ISet<string>>();
    }

    int totalCount = 0;
    int totalMissCount = 0;
    ISet<string> facetValues;
    if (facetPrefix != null)
    {
        facetValues = new JCG.HashSet<string>();
        foreach (string facetValue in context.facetValues)
        {
            if (facetValue != null && facetValue.StartsWith(facetPrefix, StringComparison.Ordinal))
            {
                facetValues.add(facetValue);
            }
        }
    }
    else
    {
        facetValues = context.facetValues;
    }

    List<TermGroupFacetCollector.FacetEntry> entries = new List<TermGroupFacetCollector.FacetEntry>(facetGroups.size());

    // also includes facets with count 0
    foreach (string facetValue in facetValues)
    {
        if (facetValue == null)
        {
            continue;
        }

        int count = facetGroups.TryGetValue(facetValue, out ISet<string> groups) && groups != null ? groups.size() : 0;
        if (count >= minCount)
        {
            entries.Add(new TermGroupFacetCollector.FacetEntry(new BytesRef(facetValue), count));
        }
        totalCount += count;
    }

    // Only include the null count when no facet prefix is specified
    if (facetPrefix == null)
    {
        if (facetGroups.TryGetValue(null, out ISet<string> groups) && groups != null)
        {
            totalMissCount = groups.size();
        }
    }

    entries.Sort(Comparer<TermGroupFacetCollector.FacetEntry>.Create((a, b) =>
    {
        if (orderByCount)
        {
            int cmp = b.Count - a.Count;
            if (cmp != 0)
            {
                return cmp;
            }
        }
        return a.Value.CompareTo(b.Value);
    }));

    int endOffset = offset + limit;
    IList<TermGroupFacetCollector.FacetEntry> entriesResult;
    if (offset >= entries.size())
    {
        entriesResult = Collections.EmptyList<TermGroupFacetCollector.FacetEntry>();
    }
    else if (endOffset >= entries.size())
    {
        entriesResult = entries.GetRange(offset, entries.size() - offset);
    }
    else
    {
        entriesResult = entries.GetRange(offset, endOffset - offset);
    }
    return new GroupedFacetResult(totalCount, totalMissCount, entriesResult);
}
public void TestRandom()
{
    string[] terms = new string[TestUtil.NextInt32(Random, 2, 10)];
    ISet<string> seen = new JCG.HashSet<string>();
    while (seen.size() < terms.Length)
    {
        string token = TestUtil.RandomSimpleString(Random, 1, 5);
        if (!seen.contains(token))
        {
            terms[seen.size()] = token;
            seen.add(token);
        }
    }

    Analyzer a = new MockAnalyzer(Random);

    int numDocs = AtLeast(10);
    long totTokens = 0;
    string[][] docs = new string[numDocs][];
    for (int i = 0; i < numDocs; i++)
    {
        docs[i] = new string[AtLeast(100)];
        if (Verbose)
        {
            Console.Write(" doc " + i + ":");
        }
        for (int j = 0; j < docs[i].Length; j++)
        {
            docs[i][j] = GetZipfToken(terms);
            if (Verbose)
            {
                Console.Write(" " + docs[i][j]);
            }
        }
        if (Verbose)
        {
            Console.WriteLine();
        }
        totTokens += docs[i].Length;
    }

    int grams = TestUtil.NextInt32(Random, 1, 4);

    if (Verbose)
    {
        Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
    }

    // Build suggester model:
    FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);
    sug.Build(new TestRandomInputIterator(this, docs));

    // Build inefficient but hopefully correct model:
    List<IDictionary<string, int?>> gramCounts = new List<IDictionary<string, int?>>(grams);
    for (int gram = 0; gram < grams; gram++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: build model for gram=" + gram);
        }
        IDictionary<string, int?> model = new JCG.Dictionary<string, int?>();
        gramCounts.Add(model);
        foreach (string[] doc in docs)
        {
            for (int i = 0; i < doc.Length - gram; i++)
            {
                StringBuilder b = new StringBuilder();
                for (int j = i; j <= i + gram; j++)
                {
                    if (j > i)
                    {
                        b.append(' ');
                    }
                    b.append(doc[j]);
                }
                string token = b.toString();
                if (!model.TryGetValue(token, out int? curCount) || curCount == null)
                {
                    model.Put(token, 1);
                }
                else
                {
                    model.Put(token, 1 + curCount);
                }
                if (Verbose)
                {
                    Console.WriteLine(" add '" + token + "' -> count=" + (model.TryGetValue(token, out int? count) ? (count.HasValue ? count.ToString() : "null") : ""));
                }
            }
        }
    }

    int lookups = AtLeast(100);
    for (int iter = 0; iter < lookups; iter++)
    {
        string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
        for (int i = 0; i < tokens.Length; i++)
        {
            tokens[i] = GetZipfToken(terms);
        }

        // Maybe trim last token; be sure not to create the
        // empty string:
        int trimStart;
        if (tokens.Length == 1)
        {
            trimStart = 1;
        }
        else
        {
            trimStart = 0;
        }
        int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[tokens.Length - 1].Length);
        tokens[tokens.Length - 1] = tokens[tokens.Length - 1].Substring(0, trimAt);

        int num = TestUtil.NextInt32(Random, 1, 100);
        StringBuilder b = new StringBuilder();
        foreach (string token in tokens)
        {
            b.append(' ');
            b.append(token);
        }
        string query = b.toString();
        query = query.Substring(1);

        if (Verbose)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
        }

        // Expected:
        List<Lookup.LookupResult> expected = new List<Lookup.LookupResult>();
        double backoff = 1.0;
        seen = new JCG.HashSet<string>();

        if (Verbose)
        {
            Console.WriteLine(" compute expected");
        }
        for (int i = grams - 1; i >= 0; i--)
        {
            if (Verbose)
            {
                Console.WriteLine(" grams=" + i);
            }

            if (tokens.Length < i + 1)
            {
                // Don't have enough tokens to use this model
                if (Verbose)
                {
                    Console.WriteLine(" skip");
                }
                continue;
            }

            if (i == 0 && tokens[tokens.Length - 1].Length == 0)
            {
                // Never suggest unigrams from empty string:
                if (Verbose)
                {
                    Console.WriteLine(" skip unigram priors only");
                }
                continue;
            }

            // Build up "context" ngram:
            b = new StringBuilder();
            for (int j = tokens.Length - i - 1; j < tokens.Length - 1; j++)
            {
                b.append(' ');
                b.append(tokens[j]);
            }
            string context = b.toString();
            if (context.Length > 0)
            {
                context = context.Substring(1);
            }
            if (Verbose)
            {
                Console.WriteLine(" context='" + context + "'");
            }
            long contextCount;
            if (context.Length == 0)
            {
                contextCount = totTokens;
            }
            else
            {
                //int? count = gramCounts.get(i - 1).get(context);
                var gramCount = gramCounts[i - 1];
                if (!gramCount.TryGetValue(context, out int? count) || count == null)
                {
                    // We never saw this context:
                    backoff *= FreeTextSuggester.ALPHA;
                    if (Verbose)
                    {
                        Console.WriteLine(" skip: never saw context");
                    }
                    continue;
                }
                contextCount = count.GetValueOrDefault();
            }
            if (Verbose)
            {
                Console.WriteLine(" contextCount=" + contextCount);
            }
            IDictionary<string, int?> model = gramCounts[i];

            // First pass, gather all predictions for this model:
            if (Verbose)
            {
                Console.WriteLine(" find terms w/ prefix=" + tokens[tokens.Length - 1]);
            }
            List<Lookup.LookupResult> tmp = new List<Lookup.LookupResult>();
            foreach (string term in terms)
            {
                if (term.StartsWith(tokens[tokens.Length - 1], StringComparison.Ordinal))
                {
                    if (Verbose)
                    {
                        Console.WriteLine(" term=" + term);
                    }
                    if (seen.contains(term))
                    {
                        if (Verbose)
                        {
                            Console.WriteLine(" skip seen");
                        }
                        continue;
                    }
                    string ngram = (context + " " + term).Trim();
                    //Integer count = model.get(ngram);
                    if (model.TryGetValue(ngram, out int? count) && count != null)
                    {
                        // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                        // return numbers that are greater than long.MaxValue, which results in a negative long number.
                        // This is also the way it is done in FreeTextSuggester to work around the issue.
                        Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount)));
                        tmp.Add(lr);
                        if (Verbose)
                        {
                            Console.WriteLine(" add tmp key='" + lr.Key + "' score=" + lr.Value);
                        }
                    }
                }
            }

            // Second pass, trim to only top N, and fold those
            // into overall suggestions:
            tmp.Sort(byScoreThenKey);
            if (tmp.size() > num)
            {
                //tmp.subList(num, tmp.size()).clear();
                tmp.RemoveRange(num, tmp.size() - num);
            }
            foreach (Lookup.LookupResult result in tmp)
            {
                string key = result.Key.toString();
                int idx = key.LastIndexOf(' ');
                string lastToken;
                if (idx != -1)
                {
                    lastToken = key.Substring(idx + 1);
                }
                else
                {
                    lastToken = key;
                }
                if (!seen.contains(lastToken))
                {
                    seen.add(lastToken);
                    expected.Add(result);
                    if (Verbose)
                    {
                        Console.WriteLine(" keep key='" + result.Key + "' score=" + result.Value);
                    }
                }
            }

            backoff *= FreeTextSuggester.ALPHA;
        }

        expected.Sort(byScoreThenKey);

        if (expected.size() > num)
        {
            expected.RemoveRange(num, expected.size() - num);
        }

        // Actual:
        IList<Lookup.LookupResult> actual = sug.DoLookup(query, num);

        if (Verbose)
        {
            Console.WriteLine(" expected: " + expected);
            Console.WriteLine(" actual: " + actual);
        }

        assertEquals(expected.ToString(), actual.ToString());
    }
}
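// A minimal sketch (illustrative only) of why the expected score above is computed in
// decimal rather than double: double cannot represent long.MaxValue exactly, so a
// fraction that should be <= 1.0 can round the product up past long.MaxValue, and the
// cast back to long then overflows to a negative value. The exact out-of-range
// conversion result varies by runtime, hence the hedged comment below.
//
//   double d = long.MaxValue * (1.0 * 5 / 5);      // rounds up to 2^63 (> long.MaxValue)
//   long bad = unchecked((long)d);                 // overflows (typically long.MinValue)
//   decimal m = long.MaxValue * (1.0m * 5 / 5);    // decimal holds all 19 digits exactly
//   long ok = (long)m;                             // == long.MaxValue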
private void CreateRandomIndexes()
{
    dir1 = NewDirectory();
    dir2 = NewDirectory();
    int numDocs = AtLeast(150);
    int numTerms = TestUtil.NextInt32(Random, 1, numDocs / 5);
    ISet<string> randomTerms = new JCG.HashSet<string>();
    while (randomTerms.size() < numTerms)
    {
        randomTerms.add(TestUtil.RandomSimpleString(Random));
    }
    terms = new JCG.List<string>(randomTerms);
    long seed = Random.NextInt64();
    IndexWriterConfig iwc1 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new Random((int)seed)));
    IndexWriterConfig iwc2 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new Random((int)seed)));
    iwc2.SetMergePolicy(NewSortingMergePolicy(sort));
    RandomIndexWriter iw1 = new RandomIndexWriter(new Random((int)seed), dir1, iwc1);
    RandomIndexWriter iw2 = new RandomIndexWriter(new Random((int)seed), dir2, iwc2);
    for (int i = 0; i < numDocs; ++i)
    {
        if (Random.nextInt(5) == 0 && i != numDocs - 1)
        {
            string term = RandomPicks.RandomFrom(Random, terms);
            iw1.DeleteDocuments(new Term("s", term));
            iw2.DeleteDocuments(new Term("s", term));
        }
        Document doc = randomDocument();
        iw1.AddDocument(doc);
        iw2.AddDocument(doc);
        if (Random.nextInt(8) == 0)
        {
            iw1.Commit();
            iw2.Commit();
        }
    }
    // Make sure we have something to merge
    iw1.Commit();
    iw2.Commit();
    Document doc2 = randomDocument();
    // NOTE: don't use RIW.AddDocument directly, since it sometimes commits,
    // which may trigger a merge, in which case ForceMerge may not do anything.
    // With field updates this is a problem, since the updates can go into the
    // single segment in the index, and therefore the index won't be sorted.
    // This hurts the assumption of the test later on, that the index is sorted
    // by SortingMP.
    iw1.IndexWriter.AddDocument(doc2);
    iw2.IndexWriter.AddDocument(doc2);

    if (DefaultCodecSupportsFieldUpdates)
    {
        // update the NDV of docs belonging to one term (covers many documents)
        long value = Random.NextInt64();
        string term = RandomPicks.RandomFrom(Random, terms);
        iw1.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value);
        iw2.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value);
    }

    iw1.ForceMerge(1);
    iw2.ForceMerge(1);
    iw1.Dispose();
    iw2.Dispose();
    reader = DirectoryReader.Open(dir1);
    sortedReader = DirectoryReader.Open(dir2);
}
public void TestTerms()
{
    Random random = Random;
    int num = AtLeast(10000);
#pragma warning disable 612, 618
    IComparer<BytesRef> comparer = random.nextBoolean()
        ? BytesRef.UTF8SortedAsUnicodeComparer
        : BytesRef.UTF8SortedAsUTF16Comparer;
#pragma warning restore 612, 618
    IDictionary<BytesRef, KeyValuePair<long, BytesRef>> sorted = new JCG.SortedDictionary<BytesRef, KeyValuePair<long, BytesRef>>(comparer);
    IDictionary<BytesRef, long> sortedWithoutPayload = new JCG.SortedDictionary<BytesRef, long>(comparer);
    IDictionary<BytesRef, KeyValuePair<long, ISet<BytesRef>>> sortedWithContext = new JCG.SortedDictionary<BytesRef, KeyValuePair<long, ISet<BytesRef>>>(comparer);
    IDictionary<BytesRef, KeyValuePair<long, KeyValuePair<BytesRef, ISet<BytesRef>>>> sortedWithPayloadAndContext = new JCG.SortedDictionary<BytesRef, KeyValuePair<long, KeyValuePair<BytesRef, ISet<BytesRef>>>>(comparer);
    Input[] unsorted = new Input[num];
    Input[] unsortedWithoutPayload = new Input[num];
    Input[] unsortedWithContexts = new Input[num];
    Input[] unsortedWithPayloadAndContext = new Input[num];
    ISet<BytesRef> ctxs;
    for (int i = 0; i < num; i++)
    {
        BytesRef key2;
        BytesRef payload;
        ctxs = new JCG.HashSet<BytesRef>();
        do
        {
            key2 = new BytesRef(TestUtil.RandomUnicodeString(random));
            payload = new BytesRef(TestUtil.RandomUnicodeString(random));
            for (int j = 0; j < AtLeast(2); j++)
            {
                ctxs.add(new BytesRef(TestUtil.RandomUnicodeString(random)));
            }
        } while (sorted.ContainsKey(key2));
        long value = random.Next();
        sortedWithoutPayload.Put(key2, value);
        sorted.Put(key2, new KeyValuePair<long, BytesRef>(value, payload));
        sortedWithContext.Put(key2, new KeyValuePair<long, ISet<BytesRef>>(value, ctxs));
        sortedWithPayloadAndContext.Put(key2, new KeyValuePair<long, KeyValuePair<BytesRef, ISet<BytesRef>>>(value, new KeyValuePair<BytesRef, ISet<BytesRef>>(payload, ctxs)));
        unsorted[i] = new Input(key2, value, payload);
        unsortedWithoutPayload[i] = new Input(key2, value);
        unsortedWithContexts[i] = new Input(key2, value, ctxs);
        unsortedWithPayloadAndContext[i] = new Input(key2, value, payload, ctxs);
    }

    // test the sorted iterator wrapper with payloads
    IInputIterator wrapper = new SortedInputIterator(new InputArrayIterator(unsorted), comparer);
    IEnumerator<KeyValuePair<BytesRef, KeyValuePair<long, BytesRef>>> expected = sorted.GetEnumerator();
    while (expected.MoveNext())
    {
        KeyValuePair<BytesRef, KeyValuePair<long, BytesRef>> entry = expected.Current;
        assertEquals(entry.Key, wrapper.Next());
        assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight);
        assertEquals(entry.Value.Value, wrapper.Payload);
    }
    assertNull(wrapper.Next());

    // test the sorted iterator wrapper with contexts
    wrapper = new SortedInputIterator(new InputArrayIterator(unsortedWithContexts), comparer);
    IEnumerator<KeyValuePair<BytesRef, KeyValuePair<long, ISet<BytesRef>>>> actualEntries = sortedWithContext.GetEnumerator();
    while (actualEntries.MoveNext())
    {
        KeyValuePair<BytesRef, KeyValuePair<long, ISet<BytesRef>>> entry = actualEntries.Current;
        assertEquals(entry.Key, wrapper.Next());
        assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight);
        ISet<BytesRef> actualCtxs = entry.Value.Value;
        assertEquals(actualCtxs, wrapper.Contexts);
    }
    assertNull(wrapper.Next());

    // test the sorted iterator wrapper with contexts and payload
    wrapper = new SortedInputIterator(new InputArrayIterator(unsortedWithPayloadAndContext), comparer);
    IEnumerator<KeyValuePair<BytesRef, KeyValuePair<long, KeyValuePair<BytesRef, ISet<BytesRef>>>>> expectedPayloadContextEntries = sortedWithPayloadAndContext.GetEnumerator();
    while (expectedPayloadContextEntries.MoveNext())
    {
        KeyValuePair<BytesRef, KeyValuePair<long, KeyValuePair<BytesRef, ISet<BytesRef>>>> entry = expectedPayloadContextEntries.Current;
        assertEquals(entry.Key, wrapper.Next());
        assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight);
        ISet<BytesRef> actualCtxs = entry.Value.Value.Value;
        assertEquals(actualCtxs, wrapper.Contexts);
        BytesRef actualPayload = entry.Value.Value.Key;
        assertEquals(actualPayload, wrapper.Payload);
    }
    assertNull(wrapper.Next());

    // test the unsorted iterator wrapper with payloads
    wrapper = new UnsortedInputIterator(new InputArrayIterator(unsorted));
    IDictionary<BytesRef, KeyValuePair<long, BytesRef>> actual = new JCG.SortedDictionary<BytesRef, KeyValuePair<long, BytesRef>>();
    BytesRef key;
    while ((key = wrapper.Next()) != null)
    {
        long value = wrapper.Weight;
        BytesRef payload = wrapper.Payload;
        actual.Put(BytesRef.DeepCopyOf(key), new KeyValuePair<long, BytesRef>(value, BytesRef.DeepCopyOf(payload)));
    }
    assertEquals(sorted, actual);

    // test the sorted iterator wrapper without payloads
    IInputIterator wrapperWithoutPayload = new SortedInputIterator(new InputArrayIterator(unsortedWithoutPayload), comparer);
    IEnumerator<KeyValuePair<BytesRef, long>> expectedWithoutPayload = sortedWithoutPayload.GetEnumerator();
    while (expectedWithoutPayload.MoveNext())
    {
        KeyValuePair<BytesRef, long> entry = expectedWithoutPayload.Current;
        assertEquals(entry.Key, wrapperWithoutPayload.Next());
        assertEquals(Convert.ToInt64(entry.Value), wrapperWithoutPayload.Weight);
        assertNull(wrapperWithoutPayload.Payload);
    }
    assertNull(wrapperWithoutPayload.Next());

    // test the unsorted iterator wrapper without payloads
    wrapperWithoutPayload = new UnsortedInputIterator(new InputArrayIterator(unsortedWithoutPayload));
    IDictionary<BytesRef, long> actualWithoutPayload = new JCG.SortedDictionary<BytesRef, long>();
    while ((key = wrapperWithoutPayload.Next()) != null)
    {
        long value = wrapperWithoutPayload.Weight;
        assertNull(wrapperWithoutPayload.Payload);
        actualWithoutPayload.Put(BytesRef.DeepCopyOf(key), value);
    }
    assertEquals(sortedWithoutPayload, actualWithoutPayload);
}