/// <summary>
/// Adds <paramref name="result"/> to <paramref name="results"/> while respecting a size limit.
/// <para/>
/// While the set holds fewer than <paramref name="num"/> entries the result is simply added.
/// Once the limit is reached, the current minimum is removed and the result added only if
/// the result's <c>value</c> is strictly greater than the minimum's; otherwise the result
/// is dropped and the set is left untouched.
/// </summary>
/// <param name="results"> the tree to add in </param>
/// <param name="result"> the result we try to add </param>
/// <param name="num"> size limit </param>
private static void BoundedTreeAdd(SortedSet<Lookup.LookupResult> results, Lookup.LookupResult result, int num)
{
    if (results.Count >= num)
    {
        // Unsynchronized pre-check: do not take the lock when the candidate cannot win.
        if (results.Min.value < result.value)
        {
            lock (syncLock)
            {
                // Re-read the minimum under the lock. It is fetched once and reused for both
                // the comparison and the removal, instead of being looked up again via LINQ.
                var lowest = results.Min;
                if (lowest.value < result.value)
                {
                    results.Remove(lowest);
                }
                else
                {
                    return;
                }
            }
        }
        else
        {
            return;
        }
    }

    // NOTE(review): this Add runs outside the lock, so the lock only covers the eviction
    // above - confirm whether callers really share "results" across threads.
    results.Add(result);
}
/// <summary>
/// Inserts a result into a size-capped sorted set.
/// <para/>
/// Below the cap the result is always inserted. At or above the cap the smallest entry is
/// polled off and the result inserted only when the result's <c>value</c> is strictly
/// greater than the smallest entry's; otherwise nothing changes.
/// </summary>
/// <param name="results"> the tree to add in </param>
/// <param name="result"> the result we try to add </param>
/// <param name="num"> size limit </param>
private static void BoundedTreeAdd(SortedSet<Lookup.LookupResult> results, Lookup.LookupResult result, int num)
{
    // Room left: no eviction needed.
    if (results.Count < num)
    {
        results.Add(result);
        return;
    }

    // Full: the newcomer must beat the current minimum to get in.
    if (results.Min.value < result.value)
    {
        results.PollFirst();
        results.Add(result);
    }
}
public void TestRandom()
{
    // Randomized check of FreeTextSuggester: index random documents drawn from a small
    // vocabulary, build a simple reference n-gram count model from the same documents,
    // then compare the suggester's answers with the reference for many random queries.

    // Pick a small vocabulary of distinct random terms.
    string[] terms = new string[TestUtil.NextInt32(Random, 2, 10)];
    ISet<string> seen = new HashSet<string>();
    while (seen.Count < terms.Length)
    {
        string token = TestUtil.RandomSimpleString(Random, 1, 5);
        // Add returns false for a duplicate; in that case we simply draw again.
        if (seen.Add(token))
        {
            terms[seen.Count - 1] = token;
        }
    }

    Analyzer a = new MockAnalyzer(Random);

    // Generate the documents, each a sequence of tokens picked by GetZipfToken.
    int numDocs = AtLeast(10);
    long totTokens = 0;
    string[][] docs = new string[numDocs][];
    for (int i = 0; i < numDocs; i++)
    {
        string[] docTokens = new string[AtLeast(100)];
        docs[i] = docTokens;
        if (VERBOSE)
        {
            Console.Write(" doc " + i + ":");
        }
        for (int j = 0; j < docTokens.Length; j++)
        {
            docTokens[j] = GetZipfToken(terms);
            if (VERBOSE)
            {
                Console.Write(" " + docTokens[j]);
            }
        }
        if (VERBOSE)
        {
            Console.WriteLine();
        }
        totTokens += docTokens.Length;
    }

    int grams = TestUtil.NextInt32(Random, 1, 4);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
    }

    // Build suggester model:
    FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);
    sug.Build(new TestRandomInputIterator(this, docs));

    // Build inefficient but hopefully correct model:
    // gramCounts[g] maps every space-joined run of (g + 1) consecutive tokens to its count.
    List<IDictionary<string, int?>> gramCounts = new List<IDictionary<string, int?>>(grams);
    for (int gram = 0; gram < grams; gram++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("TEST: build model for gram=" + gram);
        }
        IDictionary<string, int?> model = new HashMap<string, int?>();
        gramCounts.Add(model);
        foreach (string[] doc in docs)
        {
            for (int i = 0; i < doc.Length - gram; i++)
            {
                string token = string.Join(" ", doc, i, gram + 1);

                // A missing key leaves "previous" null, which counts as zero.
                int? previous;
                model.TryGetValue(token, out previous);
                int updated = (previous ?? 0) + 1;
                model[token] = updated;
                if (VERBOSE)
                {
                    Console.WriteLine(" add '" + token + "' -> count=" + updated);
                }
            }
        }
    }

    int lookups = AtLeast(100);
    for (int iter = 0; iter < lookups; iter++)
    {
        string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
        for (int i = 0; i < tokens.Length; i++)
        {
            tokens[i] = GetZipfToken(terms);
        }

        // Maybe trim last token; be sure not to create the
        // empty string:
        int trimStart = tokens.Length == 1 ? 1 : 0;
        string untrimmed = tokens[tokens.Length - 1];
        int trimAt = TestUtil.NextInt32(Random, trimStart, untrimmed.Length);
        tokens[tokens.Length - 1] = untrimmed.Substring(0, trimAt);
        string prefix = tokens[tokens.Length - 1];

        int num = TestUtil.NextInt32(Random, 1, 100);
        string query = string.Join(" ", tokens);

        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
        }

        // Expected:
        List<Lookup.LookupResult> expected = new List<Lookup.LookupResult>();
        double backoff = 1.0;
        seen = new HashSet<string>();

        if (VERBOSE)
        {
            Console.WriteLine(" compute expected");
        }

        // Walk from the highest-order model down to unigrams; each step down (or each
        // unseen context) multiplies the score weight by FreeTextSuggester.ALPHA.
        for (int i = grams - 1; i >= 0; i--)
        {
            if (VERBOSE)
            {
                Console.WriteLine(" grams=" + i);
            }

            if (tokens.Length < i + 1)
            {
                // Don't have enough tokens to use this model
                if (VERBOSE)
                {
                    Console.WriteLine(" skip");
                }
                continue;
            }

            if (i == 0 && prefix.Length == 0)
            {
                // Never suggest unigrams from empty string:
                if (VERBOSE)
                {
                    Console.WriteLine(" skip unigram priors only");
                }
                continue;
            }

            // Build up "context" ngram: the i tokens that precede the last token.
            string context = string.Join(" ", tokens, tokens.Length - i - 1, i);

            if (VERBOSE)
            {
                Console.WriteLine(" context='" + context + "'");
            }

            long contextCount;
            if (context.Length == 0)
            {
                contextCount = totTokens;
            }
            else
            {
                int? contextHits;
                if (!gramCounts[i - 1].TryGetValue(context, out contextHits) || contextHits == null)
                {
                    // We never saw this context:
                    backoff *= FreeTextSuggester.ALPHA;
                    if (VERBOSE)
                    {
                        Console.WriteLine(" skip: never saw context");
                    }
                    continue;
                }
                contextCount = contextHits.GetValueOrDefault();
            }
            if (VERBOSE)
            {
                Console.WriteLine(" contextCount=" + contextCount);
            }
            IDictionary<string, int?> model = gramCounts[i];

            // First pass, gather all predictions for this model:
            if (VERBOSE)
            {
                Console.WriteLine(" find terms w/ prefix=" + prefix);
            }
            List<Lookup.LookupResult> tmp = new List<Lookup.LookupResult>();
            foreach (string term in terms)
            {
                if (term.StartsWith(prefix, StringComparison.Ordinal))
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine(" term=" + term);
                    }
                    if (seen.Contains(term))
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine(" skip seen");
                        }
                        continue;
                    }
                    string ngram = (context + " " + term).Trim();
                    int? count;
                    if (model.TryGetValue(ngram, out count) && count != null)
                    {
                        // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                        // return numbers that are greater than long.MaxValue, which results in a negative long number.
                        // This is also the way it is being done in the FreeTextSuggester to work around the issue.
                        Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount)));
                        tmp.Add(lr);
                        if (VERBOSE)
                        {
                            Console.WriteLine(" add tmp key='" + lr.Key + "' score=" + lr.Value);
                        }
                    }
                }
            }

            // Second pass, trim to only top N, and fold those
            // into overall suggestions:
            tmp.Sort(byScoreThenKey);
            if (tmp.Count > num)
            {
                tmp.RemoveRange(num, tmp.Count - num);
            }
            foreach (Lookup.LookupResult result in tmp)
            {
                // Only the final token of the n-gram identifies the suggestion.
                string key = result.Key.ToString();
                int idx = key.LastIndexOf(' ');
                string lastToken = idx != -1 ? key.Substring(idx + 1) : key;
                if (seen.Add(lastToken))
                {
                    expected.Add(result);
                    if (VERBOSE)
                    {
                        Console.WriteLine(" keep key='" + result.Key + "' score=" + result.Value);
                    }
                }
            }

            backoff *= FreeTextSuggester.ALPHA;
        }

        expected.Sort(byScoreThenKey);

        if (expected.Count > num)
        {
            expected.RemoveRange(num, expected.Count - num);
        }

        // Actual:
        IList<Lookup.LookupResult> actual = sug.DoLookup(query, num);

        // Render the elements explicitly. Calling ToString() on the list itself only
        // compares content if the list type overrides it; a BCL List<T> returns just its
        // type name, which would make the assertion pass regardless of the results.
        string expectedText = "[" + string.Join(", ", expected) + "]";
        string actualText = "[" + string.Join(", ", actual) + "]";

        if (VERBOSE)
        {
            Console.WriteLine(" expected: " + expectedText);
            Console.WriteLine(" actual: " + actualText);
        }

        assertEquals(expectedText, actualText);
    }
}
/// <summary>
/// Inserts a result into a size-capped sorted set.
/// <para/>
/// Below the cap the result is always inserted. At or above the cap the smallest entry is
/// removed and the result inserted only when the result's <c>Value</c> is strictly greater
/// than the smallest entry's; otherwise nothing changes.
/// </summary>
/// <param name="results"> the tree to add in </param>
/// <param name="result"> the result we try to add </param>
/// <param name="num"> size limit </param>
private static void BoundedTreeAdd(JCG.SortedSet<Lookup.LookupResult> results, Lookup.LookupResult result, int num)
{
    // Room left: no eviction needed.
    if (results.Count < num)
    {
        results.Add(result);
        return;
    }

    // Capture the minimum once so the comparison and the removal act on the same object.
    var smallest = results.Min;
    if (smallest.Value < result.Value)
    {
        // Code similar to the java TreeMap class
        results.Remove(smallest);
        results.Add(result);
    }
}
public void TestBasicContext()
{
    // Indexes two suggestions that share the "foo" context and differ in their second
    // context ("bar" vs "baz"), then checks that context-filtered lookups return the right
    // subset. The second iteration repeats the lookups on a suggester reopened over the
    // same directory without rebuilding.
    Input[] keys = new Input[]
    {
        new Input("lend me your ear", 8, new BytesRef("foobar"), AsSet("foo", "bar")),
        new Input("a penny saved is a penny earned", 10, new BytesRef("foobaz"), AsSet("foo", "baz"))
    };

    DirectoryInfo tempDir = CreateTempDir("analyzingInfixContext");

    for (int iter = 0; iter < 2; iter++)
    {
        Analyzer a = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
        AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, NewFSDirectory(tempDir), a, a, 3);

        // Dispose in a finally block so a failing assertion does not leak the suggester.
        try
        {
            if (iter == 0)
            {
                suggester.Build(new InputArrayIterator(keys));
            }
            // else: test again, after close/reopen - the index from iter 0 is reused.

            // No context provided, all results returned
            IList<Lookup.LookupResult> results = suggester.DoLookup(TestUtil.StringToCharSequence("ear", Random()).ToString(), 10, true, true);
            assertEquals(2, results.Count);
            AssertContextSuggestion(results[0], "a penny saved is a penny <b>ear</b>ned", 10, "foobaz", "foo", "baz");
            AssertContextSuggestion(results[1], "lend me your <b>ear</b>", 8, "foobar", "foo", "bar");

            // Both suggestions have "foo" context:
            results = suggester.DoLookup(TestUtil.StringToCharSequence("ear", Random()).ToString(), AsSet("foo"), 10, true, true);
            assertEquals(2, results.Count);
            AssertContextSuggestion(results[0], "a penny saved is a penny <b>ear</b>ned", 10, "foobaz", "foo", "baz");
            AssertContextSuggestion(results[1], "lend me your <b>ear</b>", 8, "foobar", "foo", "bar");

            // Only one has "bar" context:
            results = suggester.DoLookup(TestUtil.StringToCharSequence("ear", Random()).ToString(), AsSet("bar"), 10, true, true);
            assertEquals(1, results.Count);
            AssertContextSuggestion(results[0], "lend me your <b>ear</b>", 8, "foobar", "foo", "bar");

            // Only one has "baz" context:
            results = suggester.DoLookup(TestUtil.StringToCharSequence("ear", Random()).ToString(), AsSet("baz"), 10, true, true);
            assertEquals(1, results.Count);
            AssertContextSuggestion(results[0], "a penny saved is a penny <b>ear</b>ned", 10, "foobaz", "foo", "baz");

            // Both have foo or bar:
            results = suggester.DoLookup(TestUtil.StringToCharSequence("ear", Random()).ToString(), AsSet("foo", "bar"), 10, true, true);
            assertEquals(2, results.Count);
            AssertContextSuggestion(results[0], "a penny saved is a penny <b>ear</b>ned", 10, "foobaz", "foo", "baz");
            AssertContextSuggestion(results[1], "lend me your <b>ear</b>", 8, "foobar", "foo", "bar");
        }
        finally
        {
            suggester.Dispose();
        }
    }
}

// Asserts the key, weight, payload and exact set of contexts of a single lookup result.
private void AssertContextSuggestion(Lookup.LookupResult result, string expectedKey, long expectedValue, string expectedPayload, params string[] expectedContexts)
{
    assertEquals(expectedKey, result.key);
    assertEquals(expectedValue, result.value);
    assertEquals(new BytesRef(expectedPayload), result.payload);
    assertNotNull(result.contexts);
    assertEquals(expectedContexts.Length, result.contexts.Count());
    foreach (string expectedContext in expectedContexts)
    {
        assertTrue(result.contexts.Contains(new BytesRef(expectedContext)));
    }
}