/// <summary>
/// Runs a single spatial test query and verifies the results against the
/// expectations in <paramref name="q"/>, interpreting them according to the
/// given <paramref name="concern"/> (ordered match, superset match, or exact set).
/// </summary>
/// <param name="concern">Controls how strictly results are compared (order / superset / exact).</param>
/// <param name="q">The query plus its expected document ids.</param>
public virtual void runTestQuery(SpatialMatchConcern concern, SpatialTestQuery q)
{
    String msg = q.toString(); //"Query: " + q.args.toString(ctx);
    // Request at least one more hit than expected so "too many results" is detectable.
    SearchResults got = executeQuery(makeQuery(q), Math.Max(100, q.ids.size() + 1));
    if (storeShape && got.numFound > 0)
    {
        // Check stored value is there.
        assertNotNull(got.results[0].document.Get(strategy.FieldName));
    }
    if (concern.orderIsImportant)
    {
        // Walk expected ids and actual results in lock-step.
        IEnumerator<String> ids = q.ids.GetEnumerator();
        foreach (SearchResult r in got.results)
        {
            String id = r.document.Get("id");
            if (!ids.MoveNext())
            {
                // BUGFIX: message previously read "Expect" fused directly to the id list.
                fail(msg + " :: Did not get enough results. Expected " + q.ids + ", got: " + got.toDebugString());
            }
            assertEquals("out of order: " + msg, ids.Current, id);
        }
        if (ids.MoveNext())
        {
            // BUGFIX: message previously read "then" instead of "than".
            fail(msg + " :: expect more results than we got: " + ids.Current);
        }
    }
    else
    {
        // We are looking at how the results overlap.
        if (concern.resultsAreSuperset)
        {
            // Every expected id must appear somewhere in the results.
            ISet<string> found = new JCG.HashSet<string>();
            foreach (SearchResult r in got.results)
            {
                found.add(r.document.Get("id"));
            }
            foreach (String s in q.ids)
            {
                if (!found.contains(s))
                {
                    // BUGFIX: message previously had typo "mising".
                    fail("Results are missing id: " + s + " :: " + found);
                }
            }
        }
        else
        {
            // Exact set match: sort both sides so order is not significant.
            List<string> found = new List<string>();
            foreach (SearchResult r in got.results)
            {
                found.Add(r.document.Get("id"));
            }
            CollectionUtil.TimSort(q.ids);
            CollectionUtil.TimSort(found);
            assertEquals(msg, q.ids.toString(), found.toString());
        }
    }
}
// Verifies that a two-word fuzzy query expands each word into its closest
// indexed variants and that the best combined match ranks first.
public void TestMultiWord()
{
    FuzzyLikeThisQuery fuzzyQuery = new FuzzyLikeThisQuery(10, analyzer);
    fuzzyQuery.AddTerms("jonathin smoth", "name", 0.3f, 1);

    // Rewrite to expose the expanded variant terms.
    Query rewritten = fuzzyQuery.Rewrite(searcher.IndexReader);
    ISet<Term> extracted = new JCG.HashSet<Term>();
    rewritten.ExtractTerms(extracted);
    assertTrue("Should have variant jonathan", extracted.contains(new Term("name", "jonathan")));
    assertTrue("Should have variant smith", extracted.contains(new Term("name", "smith")));

    ScoreDoc[] scoreDocs = searcher.Search(fuzzyQuery, 1).ScoreDocs;
    assertTrue("score docs must match 1 doc", (scoreDocs != null) && (scoreDocs.Length > 0));
    Document topDoc = searcher.Doc(scoreDocs[0].Doc);
    assertEquals("Should match most similar when using 2 words", "2", topDoc.Get("id"));
}
// The closest-edit-distance variant should outrank rarer variants.
public void TestClosestEditDistanceMatchComesFirst()
{
    FuzzyLikeThisQuery fuzzyQuery = new FuzzyLikeThisQuery(10, analyzer);
    fuzzyQuery.AddTerms("smith", "name", 0.3f, 1);

    // Rewrite to expose the expanded variant terms.
    Query rewritten = fuzzyQuery.Rewrite(searcher.IndexReader);
    ISet<Term> extracted = new JCG.HashSet<Term>();
    rewritten.ExtractTerms(extracted);
    assertTrue("Should have variant smythe", extracted.contains(new Term("name", "smythe")));
    assertTrue("Should have variant smith", extracted.contains(new Term("name", "smith")));
    assertTrue("Should have variant smyth", extracted.contains(new Term("name", "smyth")));

    ScoreDoc[] scoreDocs = searcher.Search(fuzzyQuery, 1).ScoreDocs;
    assertTrue("score docs must match 1 doc", (scoreDocs != null) && (scoreDocs.Length > 0));
    Document topDoc = searcher.Doc(scoreDocs[0].Doc);
    assertEquals("Should match most similar not most rare variant", "2", topDoc.Get("id"));
}
// With the default processing mode, no key-field value may appear twice
// in filtered search results.
public void TestDefaultFilter()
{
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    ISet<string> seenUrls = new JCG.HashSet<string>();
    ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
    for (int i = 0; i < hits.Length; i++)
    {
        string url = searcher.Doc(hits[i].Doc).Get(KEY_FIELD);
        assertFalse("No duplicate urls should be returned", seenUrls.contains(url));
        seenUrls.add(url);
    }
}
// Executes the spatial query and asserts the hit count; when assertIds is
// non-null, also asserts that each expected id is among the hits.
private void checkHits(SpatialArgs args, int assertNumFound, int[] assertIds)
{
    SearchResults got = executeQuery(strategy.MakeQuery(args), 100);
    assertEquals("" + args, assertNumFound, got.numFound);
    if (assertIds == null)
    {
        return; // Only the count was requested.
    }
    ISet<int?> hitIds = new JCG.HashSet<int?>();
    foreach (SearchResult result in got.results)
    {
        hitIds.add(int.Parse(result.document.Get("id"), CultureInfo.InvariantCulture));
    }
    foreach (int expectedId in assertIds)
    {
        assertTrue("has " + expectedId, hitIds.contains(expectedId));
    }
}
// PM_FAST_INVALIDATION mode must still deduplicate by the key field.
public void TestFastFilter()
{
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.ProcessingMode = (ProcessingMode.PM_FAST_INVALIDATION);
    ISet<string> seenUrls = new JCG.HashSet<string>();
    ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.Length > 0);
    for (int i = 0; i < hits.Length; i++)
    {
        string url = searcher.Doc(hits[i].Doc).Get(KEY_FIELD);
        assertFalse("No duplicate urls should be returned", seenUrls.contains(url));
        seenUrls.add(url);
    }
    assertEquals("Two urls found", 2, seenUrls.size());
}
// Sanity check: without the duplicate filter, the index really does
// contain duplicate key-field values.
public void TestNoFilter()
{
    ISet<string> seenUrls = new JCG.HashSet<string>();
    ScoreDoc[] hits = searcher.Search(tq, null, 1000).ScoreDocs;
    assertTrue("Default searching should have found some matches", hits.Length > 0);
    bool dupsFound = false;
    foreach (ScoreDoc hit in hits)
    {
        string url = searcher.Doc(hit.Doc).Get(KEY_FIELD);
        // Fold membership into the flag; short-circuit keeps it set once true.
        dupsFound = dupsFound || seenUrls.contains(url);
        seenUrls.add(url);
    }
    assertTrue("Default searching should have found duplicate urls", dupsFound);
}
// Builds a random NormalizeCharMap with up to 4 distinct non-empty
// key->value mappings. Duplicate keys are skipped because the builder
// rejects them.
public object Create(Random random)
{
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    ISet<string> usedKeys = new JCG.HashSet<string>();
    int attempts = random.nextInt(5);
    for (int i = 0; i < attempts; i++)
    {
        string key = TestUtil.RandomSimpleString(random);
        if (key.Length > 0 && !usedKeys.contains(key))
        {
            string value = TestUtil.RandomSimpleString(random);
            builder.Add(key, value);
            usedKeys.add(key);
        }
    }
    return (builder.Build());
}
// Runs an Intersects query for a circle (or its bounding box when bbox is
// true) of radius distKM around pt, asserting the hit count and, when
// given, the expected ids. Randomly exercises both the direct-query path
// and the filter-wrapped path.
private void _CheckHits(bool bbox, IPoint pt, double distKM, int assertNumFound, params int[] assertIds)
{
    SpatialOperation op = SpatialOperation.Intersects;
    double distDEG = DistanceUtils.Dist2Degrees(distKM, DistanceUtils.EARTH_MEAN_RADIUS_KM);
    IShape searchShape = ctx.MakeCircle(pt, distDEG);
    if (bbox)
    {
        searchShape = searchShape.BoundingBox;
    }
    SpatialArgs args = new SpatialArgs(op, searchShape);
    //args.setDistPrecision(0.025);

    // Pick one of the two equivalent execution paths at random.
    Query query = Random.nextBoolean()
        ? strategy.MakeQuery(args)
        : new FilteredQuery(new MatchAllDocsQuery(), strategy.MakeFilter(args));

    SearchResults results = executeQuery(query, 100);
    assertEquals("" + searchShape, assertNumFound, results.numFound);
    if (assertIds != null)
    {
        ISet<int?> foundIds = new JCG.HashSet<int?>();
        foreach (SearchResult result in results.results)
        {
            foundIds.add(int.Parse(result.document.Get("id"), CultureInfo.InvariantCulture));
        }
        foreach (int expectedId in assertIds)
        {
            assertTrue("has " + expectedId, foundIds.contains(expectedId));
        }
    }
}
// Randomized end-to-end test for FreeTextSuggester: builds an ngram language
// model over Zipf-distributed synthetic docs twice -- once via the suggester
// and once via a brute-force reference implementation -- then checks that
// random lookups produce identical ranked suggestions from both.
public void TestRandom()
{
    // Build a vocabulary of 2..10 unique random tokens.
    string[] terms = new string[TestUtil.NextInt32(Random, 2, 10)];
    ISet<string> seen = new JCG.HashSet<string>();
    while (seen.size() < terms.Length)
    {
        string token = TestUtil.RandomSimpleString(Random, 1, 5);
        if (!seen.contains(token))
        {
            terms[seen.size()] = token;
            seen.add(token);
        }
    }
    Analyzer a = new MockAnalyzer(Random);
    int numDocs = AtLeast(10);
    long totTokens = 0;
    // Generate documents as token arrays drawn from the Zipf distribution.
    string[][] docs = new string[numDocs][];
    for (int i = 0; i < numDocs; i++)
    {
        docs[i] = new string[AtLeast(100)];
        if (Verbose)
        {
            Console.Write(" doc " + i + ":");
        }
        for (int j = 0; j < docs[i].Length; j++)
        {
            docs[i][j] = GetZipfToken(terms);
            if (Verbose)
            {
                Console.Write(" " + docs[i][j]);
            }
        }
        if (Verbose)
        {
            Console.WriteLine();
        }
        totTokens += docs[i].Length;
    }
    int grams = TestUtil.NextInt32(Random, 1, 4);
    if (Verbose)
    {
        Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
    }

    // Build suggester model:
    FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);
    sug.Build(new TestRandomInputIterator(this, docs));

    // Build inefficient but hopefully correct model:
    // gramCounts[g] maps each (g+1)-gram string to its occurrence count.
    List<IDictionary<string, int?>> gramCounts = new List<IDictionary<string, int?>>(grams);
    for (int gram = 0; gram < grams; gram++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: build model for gram=" + gram);
        }
        IDictionary<string, int?> model = new JCG.Dictionary<string, int?>();
        gramCounts.Add(model);
        foreach (string[] doc in docs)
        {
            for (int i = 0; i < doc.Length - gram; i++)
            {
                // Join doc[i..i+gram] with single spaces to form the ngram key.
                StringBuilder b = new StringBuilder();
                for (int j = i; j <= i + gram; j++)
                {
                    if (j > i)
                    {
                        b.append(' ');
                    }
                    b.append(doc[j]);
                }
                string token = b.toString();
                if (!model.TryGetValue(token, out int? curCount) || curCount == null)
                {
                    model.Put(token, 1);
                }
                else
                {
                    model.Put(token, 1 + curCount);
                }
                if (Verbose)
                {
                    Console.WriteLine(" add '" + token + "' -> count=" + (model.TryGetValue(token, out int? count) ? (count.HasValue ? count.ToString() : "null") : ""));
                }
            }
        }
    }

    int lookups = AtLeast(100);
    for (int iter = 0; iter < lookups; iter++)
    {
        // Random query of 1..5 Zipf tokens.
        string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
        for (int i = 0; i < tokens.Length; i++)
        {
            tokens[i] = GetZipfToken(terms);
        }

        // Maybe trim last token; be sure not to create the
        // empty string:
        int trimStart;
        if (tokens.Length == 1)
        {
            trimStart = 1;
        }
        else
        {
            trimStart = 0;
        }
        int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[tokens.Length - 1].Length);
        tokens[tokens.Length - 1] = tokens[tokens.Length - 1].Substring(0, trimAt - 0);

        int num = TestUtil.NextInt32(Random, 1, 100);
        // Space-join the tokens, then drop the leading separator.
        StringBuilder b = new StringBuilder();
        foreach (string token in tokens)
        {
            b.append(' ');
            b.append(token);
        }
        string query = b.toString();
        query = query.Substring(1);

        if (Verbose)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
        }

        // Expected: compute reference suggestions with stupid-backoff scoring,
        // walking from the highest-order gram model down to unigrams.
        List<Lookup.LookupResult> expected = new List<Lookup.LookupResult>();
        double backoff = 1.0;
        seen = new JCG.HashSet<string>();

        if (Verbose)
        {
            Console.WriteLine(" compute expected");
        }
        for (int i = grams - 1; i >= 0; i--)
        {
            if (Verbose)
            {
                Console.WriteLine(" grams=" + i);
            }

            if (tokens.Length < i + 1)
            {
                // Don't have enough tokens to use this model
                if (Verbose)
                {
                    Console.WriteLine(" skip");
                }
                continue;
            }

            if (i == 0 && tokens[tokens.Length - 1].Length == 0)
            {
                // Never suggest unigrams from empty string:
                if (Verbose)
                {
                    Console.WriteLine(" skip unigram priors only");
                }
                continue;
            }

            // Build up "context" ngram: the i tokens preceding the last one.
            b = new StringBuilder();
            for (int j = tokens.Length - i - 1; j < tokens.Length - 1; j++)
            {
                b.append(' ');
                b.append(tokens[j]);
            }
            string context = b.toString();
            if (context.Length > 0)
            {
                context = context.Substring(1);
            }

            if (Verbose)
            {
                Console.WriteLine(" context='" + context + "'");
            }
            long contextCount;
            if (context.Length == 0)
            {
                contextCount = totTokens;
            }
            else
            {
                //int? count = gramCounts.get(i - 1).get(context);
                var gramCount = gramCounts[i - 1];
                if (!gramCount.TryGetValue(context, out int? count) || count == null)
                {
                    // We never saw this context:
                    backoff *= FreeTextSuggester.ALPHA;
                    if (Verbose)
                    {
                        Console.WriteLine(" skip: never saw context");
                    }
                    continue;
                }
                contextCount = count.GetValueOrDefault();
            }
            if (Verbose)
            {
                Console.WriteLine(" contextCount=" + contextCount);
            }
            IDictionary<string, int?> model = gramCounts[i];

            // First pass, gather all predictions for this model:
            if (Verbose)
            {
                Console.WriteLine(" find terms w/ prefix=" + tokens[tokens.Length - 1]);
            }
            List<Lookup.LookupResult> tmp = new List<Lookup.LookupResult>();
            foreach (string term in terms)
            {
                if (term.StartsWith(tokens[tokens.Length - 1], StringComparison.Ordinal))
                {
                    if (Verbose)
                    {
                        Console.WriteLine(" term=" + term);
                    }
                    if (seen.contains(term))
                    {
                        // Already suggested by a higher-order model; skip here.
                        if (Verbose)
                        {
                            Console.WriteLine(" skip seen");
                        }
                        continue;
                    }
                    string ngram = (context + " " + term).Trim();
                    //Integer count = model.get(ngram);
                    if (model.TryGetValue(ngram, out int? count) && count != null)
                    {
                        // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                        // return numbers that are greater than long.MaxValue, which results in a negative long number.
                        // This is also the way it is being done in the FreeTextSuggester to work around the issue.
                        Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount)));
                        tmp.Add(lr);
                        if (Verbose)
                        {
                            Console.WriteLine(" add tmp key='" + lr.Key + "' score=" + lr.Value);
                        }
                    }
                }
            }

            // Second pass, trim to only top N, and fold those
            // into overall suggestions:
            tmp.Sort(byScoreThenKey);
            if (tmp.size() > num)
            {
                //tmp.subList(num, tmp.size()).clear();
                tmp.RemoveRange(num, tmp.size() - num);
            }
            foreach (Lookup.LookupResult result in tmp)
            {
                string key = result.Key.toString();
                int idx = key.LastIndexOf(' ');
                string lastToken;
                if (idx != -1)
                {
                    lastToken = key.Substring(idx + 1);
                }
                else
                {
                    lastToken = key;
                }
                if (!seen.contains(lastToken))
                {
                    seen.add(lastToken);
                    expected.Add(result);
                    if (Verbose)
                    {
                        Console.WriteLine(" keep key='" + result.Key + "' score=" + result.Value);
                    }
                }
            }

            // Each lower-order model is discounted by another ALPHA factor.
            backoff *= FreeTextSuggester.ALPHA;
        }

        expected.Sort(byScoreThenKey);

        if (expected.size() > num)
        {
            expected.RemoveRange(num, expected.size() - num);
        }

        // Actual:
        IList<Lookup.LookupResult> actual = sug.DoLookup(query, num);

        if (Verbose)
        {
            Console.WriteLine(" expected: " + expected);
            Console.WriteLine(" actual: " + actual);
        }

        // Compare stringified ranked lists for exact agreement.
        assertEquals(expected.ToString(), actual.ToString());
    }
}