/// <summary>
/// Looking up "steam" with <c>Verbosity.Top</c> must yield exactly one suggestion:
/// the dictionary entry with the highest count among the registered candidates.
/// </summary>
public void LookupShouldReturnMostFrequent()
{
    // Three candidates that each differ from the query by one appended letter;
    // only their counts (4, 6, 2) distinguish them.
    var speller = new SymSpell();
    speller.CreateDictionaryEntry("steama", 4);
    speller.CreateDictionaryEntry("steamb", 6);
    speller.CreateDictionaryEntry("steamc", 2);

    var suggestions = speller.Lookup("steam", SymSpell.Verbosity.Top, 2);

    Assert.AreEqual(1, suggestions.Count);
    var best = suggestions[0];
    Assert.AreEqual("steamb", best.term);
    Assert.AreEqual(6, best.count);
}
/// <summary>
/// The same query against the same dictionary must return 1, 2 and 3 suggestions
/// for <c>Verbosity.Top</c>, <c>Verbosity.Closest</c> and <c>Verbosity.All</c> respectively.
/// </summary>
public void VerbosityShouldControlLookupResults()
{
    const string query = "steems";
    const int maxDistance = 2;

    var speller = new SymSpell();
    speller.CreateDictionaryEntry("steam", 1);
    speller.CreateDictionaryEntry("steams", 2);
    speller.CreateDictionaryEntry("steem", 3);

    // Each lookup is asserted immediately so a failure points at the verbosity level that broke.
    var topOnly = speller.Lookup(query, SymSpell.Verbosity.Top, maxDistance);
    Assert.AreEqual(1, topOnly.Count);

    var closest = speller.Lookup(query, SymSpell.Verbosity.Closest, maxDistance);
    Assert.AreEqual(2, closest.Count);

    var everything = speller.Lookup(query, SymSpell.Verbosity.All, maxDistance);
    Assert.AreEqual(3, everything.Count);
}
/// <summary>
/// A word registered with count 1 must not be returned by an exact (distance 0) lookup
/// when the SymSpell instance was constructed with the arguments (16, 2, 7, 10).
/// </summary>
public void LookupShouldNotReturnLowCountWord()
{
    // NOTE(review): the fourth constructor argument (10) is presumably the minimum
    // count a word needs before it becomes a valid suggestion — confirm against SymSpell.
    var speller = new SymSpell(16, 2, 7, 10);
    speller.CreateDictionaryEntry("pawn", 1);

    var suggestions = speller.Lookup("pawn", SymSpell.Verbosity.Top, 0);

    Assert.AreEqual(0, suggestions.Count);
}
/// <summary>
/// Adding 11 to an entry whose count is already <c>long.MaxValue - 10</c> must leave the
/// reported count at <c>long.MaxValue</c> instead of wrapping around.
/// </summary>
public void AddAdditionalCountsShouldNotOverflow()
{
    const string term = "hello";
    var speller = new SymSpell();

    // Seed the entry just below the maximum representable count.
    speller.CreateDictionaryEntry(term, long.MaxValue - 10);
    var suggestions = speller.Lookup(term, SymSpell.Verbosity.Top);
    // Anything other than exactly one suggestion is reported as count 0 so the assert fails.
    long observed = suggestions.Count == 1 ? suggestions[0].count : 0;
    Assert.AreEqual(long.MaxValue - 10, observed);

    // Push the count past the maximum; the expected result is long.MaxValue.
    speller.CreateDictionaryEntry(term, 11);
    suggestions = speller.Lookup(term, SymSpell.Verbosity.Top);
    observed = suggestions.Count == 1 ? suggestions[0].count : 0;
    Assert.AreEqual(long.MaxValue, observed);
}
/// <summary>
/// Registering the same word a second time must add the new count to the existing one
/// (11 followed by 3 is reported as 14).
/// </summary>
public void AddAdditionalCountsShouldIncreaseCount()
{
    const string term = "hello";
    var speller = new SymSpell();

    speller.CreateDictionaryEntry(term, 11);
    var suggestions = speller.Lookup(term, SymSpell.Verbosity.Top);
    // Anything other than exactly one suggestion is reported as count 0 so the assert fails.
    long observed = suggestions.Count == 1 ? suggestions[0].count : 0;
    Assert.AreEqual(11, observed);

    speller.CreateDictionaryEntry(term, 3);
    suggestions = speller.Lookup(term, SymSpell.Verbosity.Top);
    observed = suggestions.Count == 1 ? suggestions[0].count : 0;
    Assert.AreEqual(11 + 3, observed);
}
/// <summary>
/// Reads the suggestion file line by line, builds a token dictionary, token frequencies and
/// an inverted index (token id -> line indexes), fills a suffix trie and a SymSpell
/// dictionary from the tokens, saves everything through <c>SaveIndex</c> and finally runs
/// one sample query through <c>Search</c>, printing memory and timing figures on the way.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Data set to index. Other data sets previously used here (same .txt/.index naming):
    // "all-suggests-cleaned" and "small-suggests".
    var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
    var path = Path.Combine(baseDirectory, "real-suggests.txt");
    var indexerPath = Path.Combine(baseDirectory, "real-suggests.index");

    var dict = new Dictionary<string, int>();            // token -> token id (ids start at 1)
    var frequency = new Dictionary<string, int>();       // token -> number of occurrences
    var inverter = new Dictionary<int, HashSet<int>>();  // token id -> indexes of lines containing it
    var documents = new List<string>();                  // every line, addressed by line index

    var count = 0;
    var stringIndex = 0;
    Console.WriteLine("Reading and indexing ...");
    var trie = new PatriciaSuffixTrie<string>(1);
    using (var sr = new StreamReader(path))
    {
        string s;
        while ((s = sr.ReadLine()) != null)
        {
            documents.Add(s);
            foreach (var word in s.Split(' '))
            {
                int wordId;
                if (!dict.TryGetValue(word, out wordId))
                {
                    // First time this token is seen: allocate the next id.
                    wordId = ++count;
                    dict[word] = wordId;
                    frequency[word] = 1;
                }
                else
                {
                    frequency[word]++;
                }

                // Key the posting set by the token's own id rather than by the running counter.
                HashSet<int> lineIndexes;
                if (!inverter.TryGetValue(wordId, out lineIndexes))
                {
                    lineIndexes = new HashSet<int>();
                    inverter[wordId] = lineIndexes;
                }
                lineIndexes.Add(stringIndex);
            }
            stringIndex++;
        }
    }

    Console.WriteLine("Adding to completion dict...");
    long memSize = GC.GetTotalMemory(true);
    foreach (var entry in dict)
    {
        trie.Add(entry.Key, entry.Key);
    }
    Console.WriteLine("Add to completion dict: " + ((GC.GetTotalMemory(true) - memSize) / 1024 / 1024.0).ToString("N0") + " MB. \nToken count: " + dict.Count);

    Stopwatch stopWatch = Stopwatch.StartNew();
    Console.WriteLine("Adding to symSpell for fast spellCheck");
    memSize = GC.GetTotalMemory(true);
    // dictionary for symspell
    var spellChecker = new SymSpell(dict.Count, 2);
    foreach (var entry in frequency)
    {
        spellChecker.CreateDictionaryEntry(entry.Key, entry.Value);
    }
    Console.WriteLine("Spell dictionary constructed. " + ((GC.GetTotalMemory(true) - memSize) / 1024 / 1024.0).ToString("N0") + "MB, " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms. Tokens:" + frequency.Count);

    // Time the save itself. Previously the stopwatch was stopped before SaveIndex ran and
    // then replaced by a never-started instance, so the reported time was always 0.0ms.
    Console.WriteLine("Saving index ...");
    stopWatch.Restart();
    long byteCount = SaveIndex(indexerPath, trie, spellChecker, inverter, dict, documents);
    stopWatch.Stop();
    Console.WriteLine("File saved: " + byteCount + " bytes. Time ellapsed: " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms");

    stopWatch.Restart();
    Console.WriteLine("Searching ...");
    var hits = Search("342 cw", trie, spellChecker, inverter, dict, documents, 10);
    stopWatch.Stop();
    var timeEllapsed = stopWatch.Elapsed.TotalMilliseconds.ToString("0.0");
    // To inspect the individual hits, print hit.value for each element of hits here.
    Console.WriteLine("Searching done." + timeEllapsed + "ms. Hits:" + hits.Length);
}
/// <summary>
/// Scans one line character by character and turns it into <c>Action</c> entries.
/// When <paramref name="input"/> is true the actions are appended to <c>p.input</c>;
/// otherwise a new list is appended to <c>p.responses</c> and the actions go there.
/// Tokens whose operator is "$" or "$$" do not produce an action: they set
/// <c>p.context</c> or <c>p.contextRouter</c> instead.
/// </summary>
/// <param name="line">The text to scan. A trailing space is appended so the last token is flushed.</param>
/// <param name="input">True to fill the pattern's input list, false to start a new response list.</param>
/// <param name="p">The pattern that receives the actions, context values and counters.</param>
/// <param name="nline">Passed through to every <c>Action</c> (presumably the source line number — confirm).</param>
public void parse(string line, bool input, Pattern p, int nline) {
    // Scanner state:
    //   's' - between tokens (spaces are skipped)
    //   'w' - collecting a plain word into holder
    //   'o' - collecting operator characters into _operator
    //   'a' - collecting the argument that follows an operator into holder
    //   'n' - inside an underscore-delimited operator: entered by '_' from 's',
    //         left by the next '_' (which switches to 'a')
    char status = 's';
    //int ind =0;
    string holder = "";
    string _operator = "";
    // Guarantees that a token ending at the end of the line is flushed by the space handlers below.
    line += ' ';
    if (!input) {
        p.responses.Add(new List <Action>());
    }
    var vTarget = input ? p.input : p.responses[p.responses.Count - 1];
    for (int ind = 0; ind < line.Length; ind++) {
        char c = line[ind];
        if (c == ' ' && status == 's') {
            continue;
        }
        // A space ends an operator token (with or without argument): flush it.
        if (c == ' ' && (status == 'a' || status == 'o')) {
            status = 's';
            // An empty operator is recorded as a single space, the same value used for plain words.
            if (_operator.Equals("")) {
                _operator = " ";
            }
            if (!_operator.Equals("$") && !_operator.Equals("$$")) {
                // ind + 1 is handed to Action as a position (presumably a 1-based column — confirm).
                vTarget.Add(new Action(_operator, holder, nline, ind + 1));
            } else if (_operator.Equals("$$")) {
                p.contextRouter = holder.Trim();
                Console.WriteLine("SETTING route as {0}", holder);
            } else {
                Console.WriteLine("Route as {0}", holder);
                p.context = holder.Trim();
            }
            // Every input token except those with operator "*" increments the counter.
            if (!_operator.Equals("*") && input) {
                p.tranferTerms++;
            }
            // Input tokens are also registered with corrector (declared outside this method) with count 3.
            if (input) {
                corrector.CreateDictionaryEntry(holder, 3);
            }
            if (_operator == "->" || _operator == "@") {
                p.hasSubject = true;
            }
            holder = "";
            _operator = "";
            continue;
        }
        // A space ends a plain word: flush it with the operator " ".
        if (c == ' ' && status == 'w') {
            status = 's';
            // NOTE(review): state 'w' is only entered from 's', where _operator has been reset to "",
            // so the "$" / "$$" branches below look unreachable — confirm before relying on them.
            if (!_operator.Equals("$") && !_operator.Equals("$$")) {
                vTarget.Add(new Action(" ", holder, nline, ind + 1));
            } else if (_operator.Equals("$$")) {
                p.contextRouter = holder.Trim();
                Console.WriteLine("SETTING route as {0}", holder);
            } else {
                Console.WriteLine("Route as {0}", holder);
                p.context = holder.Trim();
            }
            if (!_operator.Equals("*") && input) {
                p.tranferTerms++;
            }
            if (input) {
                corrector.CreateDictionaryEntry(holder, 3);
            }
            holder = "";
            _operator = "";
            continue;
        }
        // '#' stops scanning the rest of the line in every state (presumably a comment marker).
        // A token still being collected at that point is not flushed.
        if (c == '#') {
            break;
        }
        // Taken for ASCII letters (65-90, 97-122) and digits (48-57), for any character while a
        // word or argument is being collected, and for any non-underscore character in state 'n'.
        if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122) || (c >= 48 && c <= 57) || (status == 'n' && c != '_') || status == 'w' || status == 'a') {
            if (status == 's') {
                status = 'w';
            }
            if (status == 'w') {
                holder += c;
                continue;
            }
            // First letter/digit after operator characters starts the argument.
            if (status == 'o') {
                status = 'a';
            }
            if (status == 'a') {
                holder += c;
                continue;
            }
            // Inside underscores everything (including spaces) belongs to the operator.
            if (status == 'n') {
                _operator += c;
            }
        } else {
            if (c == '_') {
                // '_' opens an underscore-delimited operator from 's' and closes it from 'n';
                // in state 'o' it is skipped without effect.
                if (status == 's') {
                    status = 'n';
                } else if (status == 'n') {
                    status = 'a';
                }
                continue;
            } else {
                // Any other symbol starts or extends an operator.
                if (status == 's') {
                    status = 'o';
                }
                if (status == 'o') {
                    _operator += c;
                }
            }
        }
    }
}
/// <summary>
/// Interactive search console. Reads the phrase file, builds a token index and token
/// frequencies, adds every phrase to a suffix trie and every token to a SymSpell dictionary,
/// then repeatedly reads a query from standard input and prints the matching phrases and
/// spelling suggestions. The loop ends when the user types "exit" or when input ends.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    var path = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "all-suggests-cleaned.txt");
    Console.Write("Creating trie ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = Stopwatch.StartNew();

    var wordToIndex = new Dictionary<string, int>();    // token -> token id (ids start at 0)
    var wordFrequency = new Dictionary<string, int>();  // token -> number of occurrences
    var phraseList = new List<string>();                // trimmed lines, addressed by index
    int count = 0;
    using (StreamReader sr = new StreamReader(path))
    {
        // ReadLine returns null at end of file; this replaces the Peek()-based loop.
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            var phrase = line.Trim();
            phraseList.Add(phrase);
            foreach (var token in phrase.Split(' '))
            {
                if (!wordToIndex.ContainsKey(token))
                {
                    wordToIndex[token] = count++;
                }

                // TryGetValue leaves freq at 0 for an unseen token, so freq + 1 covers both cases.
                int freq;
                wordFrequency.TryGetValue(token, out freq);
                wordFrequency[token] = freq + 1;
            }
        }
    }

    long memDeltaForStoringValues = GC.GetTotalMemory(true) - memSize;
    Console.WriteLine("Memory for storing value: " + memDeltaForStoringValues + ". Going to add to trie");

    // Each phrase is stored in the trie under its index in phraseList.
    var trie = new UkkonenTrie<int>(1);
    for (int phraseIndex = 0; phraseIndex < phraseList.Count; phraseIndex++)
    {
        trie.Add(phraseList[phraseIndex], phraseIndex);
    }

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;
    Console.WriteLine("Done in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms " + (memDelta / 1024 / 1024.0).ToString("N0") + " MB. \nToken count: " + wordToIndex.Count);

    // spell checker
    var spellChecker = new SymSpell(wordToIndex.Count, 2);
    foreach (var entry in wordFrequency)
    {
        spellChecker.CreateDictionaryEntry(entry.Key, entry.Value);
    }

    while (true)
    {
        Console.WriteLine("Input string to search:");
        var s = Console.ReadLine();
        // Console.ReadLine returns null once standard input is closed or exhausted;
        // treat that like "exit" instead of failing on s.ToLower().
        if (s == null || s == "exit")
        {
            return;
        }

        var normalized = s.ToLower();
        var suggests = spellChecker.LookupCompound(normalized, 2);

        // lookup in trie
        var results = trie.Retrieve(normalized);
        var resultCount = 0;
        foreach (var result in results)
        {
            Console.WriteLine("--> " + phraseList[result]);
            resultCount++;
        }

        foreach (var sug in suggests)
        {
            Console.WriteLine("Can search for: " + sug.term);
        }

        // Only index the first suggestion when there is one.
        if (suggests.Count > 0 && suggests[0].term != normalized)
        {
            Console.WriteLine("Did you mean: " + suggests[0].term + "?");
        }

        Console.WriteLine(String.Format("Found {0} result", resultCount));
    }
}