/// <summary>
/// Sole constructor: captures the enclosing <c>FreeTextSuggester</c>, the set of
/// already-suggested last tokens, and the final query token, then delegates the
/// FST search configuration to the base <c>TopNSearcher</c>.
/// </summary>
/// <param name="outerInstance">Enclosing suggester whose state the search callbacks read.</param>
/// <param name="fst">FST to search.</param>
/// <param name="num">Number of hits requested.</param>
/// <param name="size">Queue size passed through to the base searcher.</param>
/// <param name="weightComparator">Orders partial paths by weight.</param>
/// <param name="seen">Last tokens already suggested; used to suppress duplicates.</param>
/// <param name="finalLastToken">The (possibly partial) last token of the query.</param>
// NOTE(review): the original declared `size` and `weightComparator` as `UnknownType`,
// a Java->C# converter artifact that cannot compile; the concrete types here match
// the typed overload of this same helper elsewhere in the file.
public TopNSearcherAnonymousInnerClassHelper(FreeTextSuggester outerInstance, FST<long?> fst, int num, int size, IComparer<long?> weightComparator, HashSet<BytesRef> seen, BytesRef finalLastToken) : base(fst, num, size, weightComparator)
{
    this.outerInstance = outerInstance;
    this.seen = seen;
    this.finalLastToken = finalLastToken;
    scratchBytes = new BytesRef();
}
/// <summary>
/// Builds a bigram suggester over a single repetitive input and checks that the
/// lookup for "foo b" yields exactly one suggestion (no duplicates across grams).
/// </summary>
public void TestNoDupsAcrossGrams()
{
    Analyzer analyzer = new MockAnalyzer(Random);
    IEnumerable<Input> inputs = AnalyzingSuggesterTest.Shuffle(new Input("foo bar bar bar bar", 50));

    FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 2, (byte)0x20);
    suggester.Build(new InputArrayIterator(inputs));

    assertEquals("foo bar/1.00", ToString(suggester.DoLookup("foo b", 10)));
}
/// <summary>
/// Uses an analyzer that deletes "of", so the query "wizard of of" ends in two
/// consecutive holes; verifies no suggestions are produced in that case.
/// </summary>
public void TestTwoEndingHoles()
{
    // Just deletes "of"
    Analyzer analyzer = new TestEndingHoleAnalyzer();
    IEnumerable<Input> inputs = AnalyzingSuggesterTest.Shuffle(new Input("wizard of of oz", 50));

    FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 3, (byte)0x20);
    suggester.Build(new InputArrayIterator(inputs));

    assertEquals("", ToString(suggester.DoLookup("wizard of of", 10)));
}
/// <summary>
/// Builds a unigram-only suggester and verifies the suggestions for prefix "b":
/// ordered first by count (descending), then by term (ascending).
/// </summary>
public void TestUnigrams()
{
    Analyzer analyzer = new MockAnalyzer(Random);
    IEnumerable<Input> inputs = AnalyzingSuggesterTest.Shuffle(new Input("foo bar baz blah boo foo bar foo bee", 50));

    FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 1, (byte)0x20);
    suggester.Build(new InputArrayIterator(inputs));

    // Sorts first by count, descending, second by term, ascending
    assertEquals("bar/0.22 baz/0.11 bee/0.11 blah/0.11 boo/0.11", ToString(suggester.DoLookup("b", 10)));
}
/// <summary>
/// Sole constructor: forwards the FST search configuration to the base
/// <c>TopNSearcher</c> and captures the state the search callbacks need —
/// the enclosing suggester, the already-seen last tokens, and the final
/// (possibly partial) query token. Also allocates the scratch buffer
/// reused during the search.
/// </summary>
public TopNSearcherAnonymousInnerClassHelper(
    FreeTextSuggester outerInstance,
    FST<long?> fst,
    int num,
    int size,
    IComparer<long?> weightComparer,
    IEnumerable<BytesRef> seen,
    BytesRef finalLastToken)
    : base(fst, num, size, weightComparer)
{
    this.outerInstance = outerInstance;
    this.finalLastToken = finalLastToken;
    this.seen = seen;
    this.scratchBytes = new BytesRef();
}
/// <summary>
/// Sole constructor: passes the FST search parameters through to the base
/// <c>TopNSearcher</c>, then records the enclosing suggester, the set of last
/// tokens already suggested, and the final query token for use by the search
/// callbacks. A fresh scratch buffer is created for the search.
/// </summary>
public TopNSearcherAnonymousClass(
    FreeTextSuggester outerInstance,
    FST<Int64> fst,
    int num,
    int size,
    IComparer<Int64> weightComparer,
    ISet<BytesRef> seen,
    BytesRef finalLastToken)
    : base(fst, num, size, weightComparer)
{
    this.outerInstance = outerInstance;
    this.finalLastToken = finalLastToken;
    this.seen = seen;
    this.scratchBytes = new BytesRef();
}
/// <summary>
/// End-to-end sanity test of the bigram suggester: checks lookups that use the
/// bigram model, the unigram backoff, and the unigram model alone; then stores
/// the suggester to disk, reloads it, and re-runs the same lookups.
/// </summary>
public void TestBasic()
{
    IEnumerable<Input> inputs = AnalyzingSuggesterTest.Shuffle(
        new Input("foo bar baz blah", 50),
        new Input("boo foo bar foo bee", 20));
    Analyzer analyzer = new MockAnalyzer(Random);

    FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 2, (byte)0x20);
    suggester.Build(new InputArrayIterator(inputs));
    assertEquals(2, suggester.Count);

    // Round 0 checks the freshly-built suggester; round 1 re-checks after save/load.
    for (int round = 0; round < 2; round++)
    {
        // Uses bigram model and unigram backoff:
        assertEquals("foo bar/0.67 foo bee/0.33 baz/0.04 blah/0.04 boo/0.04", ToString(suggester.DoLookup("foo b", 10)));

        // Uses only bigram model:
        assertEquals("foo bar/0.67 foo bee/0.33", ToString(suggester.DoLookup("foo ", 10)));

        // Uses only unigram model:
        assertEquals("foo/0.33", ToString(suggester.DoLookup("foo", 10)));

        // Uses only unigram model:
        assertEquals("bar/0.22 baz/0.11 bee/0.11 blah/0.11 boo/0.11", ToString(suggester.DoLookup("b", 10)));

        // Try again after save/load:
        DirectoryInfo tempDir = CreateTempDir("FreeTextSuggesterTest");
        FileInfo storeFile = new FileInfo(Path.Combine(tempDir.FullName, "suggester"));

        using (Stream output = new FileStream(storeFile.FullName, FileMode.Create, FileAccess.Write))
        {
            suggester.Store(output);
        }

        using (Stream input = new FileStream(storeFile.FullName, FileMode.Open, FileAccess.Read))
        {
            suggester = new FreeTextSuggester(analyzer, analyzer, 2, (byte)0x20);
            suggester.Load(input);
        }

        assertEquals(2, suggester.Count);
    }
}
/// <summary>
/// Verifies that Build rejects input containing the default separator byte:
/// a token holding U+001E must cause an <see cref="ArgumentException"/>.
/// </summary>
public void TestIllegalByteDuringBuild()
{
    // Default separator is INFORMATION SEPARATOR TWO
    // (0x1e), so no input token is allowed to contain it
    IEnumerable<Input> inputs = AnalyzingSuggesterTest.Shuffle(new Input("foo\u001ebar baz", 50));
    FreeTextSuggester suggester = new FreeTextSuggester(new MockAnalyzer(Random));

    try
    {
        suggester.Build(new InputArrayIterator(inputs));
        fail("did not hit expected exception");
    }
    catch (ArgumentException /*iae*/)
    {
        // expected
    }
}
/// <summary>
/// Uses an analyzer that deletes "of", so "wizard of" ends in a single hole;
/// verifies the hole is represented as "_" in the suggestion, and that a
/// partial last token falls back to the unigram model with backoff.
/// </summary>
public void TestEndingHole()
{
    // Just deletes "of"
    Analyzer analyzer = new TestEndingHoleAnalyzer();
    IEnumerable<Input> inputs = AnalyzingSuggesterTest.Shuffle(new Input("wizard of oz", 50));

    FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 3, (byte)0x20);
    suggester.Build(new InputArrayIterator(inputs));

    assertEquals("wizard _ oz/1.00", ToString(suggester.DoLookup("wizard of", 10)));

    // Falls back to unigram model, with backoff 0.4 times
    // prop 0.5:
    assertEquals("oz/0.20", ToString(suggester.DoLookup("wizard o", 10)));
}
/// <summary>
/// Verifies that looking up the empty string throws <see cref="ArgumentException"/>.
/// </summary>
public void TestEmptyString()
{
    Analyzer analyzer = new MockAnalyzer(Random);
    IEnumerable<Input> inputs = AnalyzingSuggesterTest.Shuffle(new Input("foo bar bar bar bar", 50));

    FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 2, (byte)0x20);
    suggester.Build(new InputArrayIterator(inputs));

    try
    {
        suggester.DoLookup("", 10);
        fail("did not hit exception");
    }
    catch (ArgumentException /*iae*/)
    {
        // expected
    }
}
/// <summary>
/// Builds a suggester from a local enwiki line-file dump and, when VERBOSE,
/// prints the model size and the results of one sample lookup. Requires the
/// dump to exist at the hard-coded path.
/// </summary>
public void TestWiki()
{
    LineFileDocs lineDocs = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false);

    // Skip header:
    lineDocs.NextDoc();

    FreeTextSuggester suggester = new FreeTextSuggester(new MockAnalyzer(Random));
    suggester.Build(new TestWikiInputIterator(this, lineDocs));

    if (VERBOSE)
    {
        Console.WriteLine(suggester.GetSizeInBytes() + " bytes");
        IList<Lookup.LookupResult> results = suggester.DoLookup("general r", 10);
        Console.WriteLine("results:");
        foreach (Lookup.LookupResult result in results)
        {
            Console.WriteLine(" " + result);
        }
    }
}
// Sole constructor: captures the enclosing FreeTextSuggester so the comparer
// can reach its state.
public ComparerAnonymousInnerClassHelper(FreeTextSuggester outerInstance) { this.outerInstance = outerInstance; }
// Sole constructor: passes the reuse strategy to the AnalyzerWrapper base and
// captures the enclosing suggester plus the wrapped analyzer for later delegation.
public AnalyzerWrapperAnonymousInnerClassHelper(FreeTextSuggester outerInstance, ReuseStrategy reuseStrategy, Analyzer other) : base(reuseStrategy) { this.outerInstance = outerInstance; this.other = other; }
/// <summary>
/// Randomized test: generates a small vocabulary and a set of Zipf-distributed
/// documents, builds a FreeTextSuggester over them, independently builds a
/// brute-force n-gram count model, and for many random queries compares the
/// suggester's output against the expectation computed from the brute-force
/// model (with ALPHA backoff across gram orders).
/// </summary>
public void TestRandom()
{
    // Build a random vocabulary of 2..10 distinct simple terms.
    string[] terms = new string[TestUtil.NextInt32(Random, 2, 10)];
    ISet<string> seen = new HashSet<string>();
    while (seen.size() < terms.Length)
    {
        string token = TestUtil.RandomSimpleString(Random, 1, 5);
        if (!seen.contains(token))
        {
            terms[seen.size()] = token;
            seen.add(token);
        }
    }

    Analyzer a = new MockAnalyzer(Random);

    int numDocs = AtLeast(10);
    long totTokens = 0;
    string[][] docs = new string[numDocs][];
    // Fill each doc with Zipf-distributed tokens from the vocabulary.
    for (int i = 0; i < numDocs; i++)
    {
        docs[i] = new string[AtLeast(100)];
        if (VERBOSE)
        {
            Console.Write(" doc " + i + ":");
        }
        for (int j = 0; j < docs[i].Length; j++)
        {
            docs[i][j] = GetZipfToken(terms);
            if (VERBOSE)
            {
                Console.Write(" " + docs[i][j]);
            }
        }
        if (VERBOSE)
        {
            Console.WriteLine();
        }
        totTokens += docs[i].Length;
    }

    int grams = TestUtil.NextInt32(Random, 1, 4);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
    }

    // Build suggester model:
    FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);
    sug.Build(new TestRandomInputIterator(this, docs));

    // Build inefficient but hopefully correct model:
    // gramCounts[g] maps a (g+1)-gram string (space-joined) to its corpus count.
    List<IDictionary<string, int?>> gramCounts = new List<IDictionary<string, int?>>(grams);
    for (int gram = 0; gram < grams; gram++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("TEST: build model for gram=" + gram);
        }
        IDictionary<string, int?> model = new HashMap<string, int?>();
        gramCounts.Add(model);
        foreach (string[] doc in docs)
        {
            for (int i = 0; i < doc.Length - gram; i++)
            {
                // Join doc[i..i+gram] with single spaces to form the n-gram key.
                StringBuilder b = new StringBuilder();
                for (int j = i; j <= i + gram; j++)
                {
                    if (j > i)
                    {
                        b.append(' ');
                    }
                    b.append(doc[j]);
                }
                string token = b.toString();
                int? curCount = model.ContainsKey(token) ? model[token] : null;
                if (curCount == null)
                {
                    model.Put(token, 1);
                }
                else
                {
                    model.Put(token, 1 + curCount);
                }
                if (VERBOSE)
                {
                    Console.WriteLine(" add '" + token + "' -> count=" + (model.ContainsKey(token) ? model[token].ToString() : ""));
                }
            }
        }
    }

    int lookups = AtLeast(100);
    for (int iter = 0; iter < lookups; iter++)
    {
        // Random query of 1..5 Zipf tokens.
        string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
        for (int i = 0; i < tokens.Length; i++)
        {
            tokens[i] = GetZipfToken(terms);
        }

        // Maybe trim last token; be sure not to create the
        // empty string:
        int trimStart;
        if (tokens.Length == 1)
        {
            trimStart = 1;
        }
        else
        {
            trimStart = 0;
        }
        int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[tokens.Length - 1].Length);
        tokens[tokens.Length - 1] = tokens[tokens.Length - 1].Substring(0, trimAt - 0);

        int num = TestUtil.NextInt32(Random, 1, 100);
        // Join tokens into the query string (leading space stripped below).
        StringBuilder b = new StringBuilder();
        foreach (string token in tokens)
        {
            b.append(' ');
            b.append(token);
        }
        string query = b.toString();
        query = query.Substring(1);

        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
        }

        // Expected:
        List<Lookup.LookupResult> expected = new List<Lookup.LookupResult>();
        double backoff = 1.0;
        seen = new HashSet<string>();

        if (VERBOSE)
        {
            Console.WriteLine(" compute expected");
        }
        // Walk from the highest-order gram model down to unigrams, multiplying
        // backoff by ALPHA each time a model is skipped or consumed.
        for (int i = grams - 1; i >= 0; i--)
        {
            if (VERBOSE)
            {
                Console.WriteLine(" grams=" + i);
            }

            if (tokens.Length < i + 1)
            {
                // Don't have enough tokens to use this model
                if (VERBOSE)
                {
                    Console.WriteLine(" skip");
                }
                continue;
            }

            if (i == 0 && tokens[tokens.Length - 1].Length == 0)
            {
                // Never suggest unigrams from empty string:
                if (VERBOSE)
                {
                    Console.WriteLine(" skip unigram priors only");
                }
                continue;
            }

            // Build up "context" ngram:
            b = new StringBuilder();
            for (int j = tokens.Length - i - 1; j < tokens.Length - 1; j++)
            {
                b.append(' ');
                b.append(tokens[j]);
            }
            string context = b.toString();
            if (context.Length > 0)
            {
                context = context.Substring(1);
            }
            if (VERBOSE)
            {
                Console.WriteLine(" context='" + context + "'");
            }
            long contextCount;
            if (context.Length == 0)
            {
                contextCount = totTokens;
            }
            else
            {
                //int? count = gramCounts.get(i - 1).get(context);
                var gramCount = gramCounts[i - 1];
                int? count = gramCount.ContainsKey(context) ? gramCount[context] : null;
                if (count == null)
                {
                    // We never saw this context:
                    backoff *= FreeTextSuggester.ALPHA;
                    if (VERBOSE)
                    {
                        Console.WriteLine(" skip: never saw context");
                    }
                    continue;
                }
                contextCount = count.GetValueOrDefault();
            }
            if (VERBOSE)
            {
                Console.WriteLine(" contextCount=" + contextCount);
            }
            IDictionary<string, int?> model = gramCounts[i];

            // First pass, gather all predictions for this model:
            if (VERBOSE)
            {
                Console.WriteLine(" find terms w/ prefix=" + tokens[tokens.Length - 1]);
            }
            List<Lookup.LookupResult> tmp = new List<Lookup.LookupResult>();
            foreach (string term in terms)
            {
                if (term.StartsWith(tokens[tokens.Length - 1], StringComparison.Ordinal))
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine(" term=" + term);
                    }
                    if (seen.contains(term))
                    {
                        // Already suggested by a higher-order model; skip.
                        if (VERBOSE)
                        {
                            Console.WriteLine(" skip seen");
                        }
                        continue;
                    }
                    string ngram = (context + " " + term).Trim();
                    //Integer count = model.get(ngram);
                    int? count = model.ContainsKey(ngram) ? model[ngram] : null;
                    if (count != null)
                    {
                        // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                        // return numbers that are greater than long.MaxValue, which results in a negative long number.
                        // This is also the way it is being done in the FreeTextSuggester to work around the issue.
                        Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount)));
                        tmp.Add(lr);
                        if (VERBOSE)
                        {
                            Console.WriteLine(" add tmp key='" + lr.Key + "' score=" + lr.Value);
                        }
                    }
                }
            }

            // Second pass, trim to only top N, and fold those
            // into overall suggestions:
            tmp.Sort(byScoreThenKey);
            if (tmp.size() > num)
            {
                //tmp.subList(num, tmp.size()).clear();
                tmp.RemoveRange(num, tmp.size() - num);
            }
            foreach (Lookup.LookupResult result in tmp)
            {
                string key = result.Key.toString();
                int idx = key.LastIndexOf(' ');
                string lastToken;
                if (idx != -1)
                {
                    lastToken = key.Substring(idx + 1);
                }
                else
                {
                    lastToken = key;
                }
                if (!seen.contains(lastToken))
                {
                    seen.add(lastToken);
                    expected.Add(result);
                    if (VERBOSE)
                    {
                        Console.WriteLine(" keep key='" + result.Key + "' score=" + result.Value);
                    }
                }
            }

            backoff *= FreeTextSuggester.ALPHA;
        }

        expected.Sort(byScoreThenKey);

        if (expected.size() > num)
        {
            expected.RemoveRange(num, expected.size() - num);
        }

        // Actual:
        IList<Lookup.LookupResult> actual = sug.DoLookup(query, num);

        if (VERBOSE)
        {
            Console.WriteLine(" expected: " + expected);
            Console.WriteLine(" actual: " + actual);
        }

        assertEquals(expected.ToString(), actual.ToString());
    }
}