/// <summary>
/// Returns the best spelling correction for <paramref name="input"/>.
/// Input without a space is corrected as a single word via <c>Lookup</c> (closest verbosity);
/// input containing a space is corrected via <c>LookupCompound</c>.
/// </summary>
/// <param name="input">Word or phrase to correct.</param>
/// <returns>
/// The term of the first suggestion, or <paramref name="input"/> unchanged when it is null or empty
/// or when no suggestion is returned.
/// </returns>
public string SpellCheck(string input)
{
    // Nothing to correct. IsNullOrEmpty also protects against a null argument,
    // which previously raised a NullReferenceException.
    if (string.IsNullOrEmpty(input))
    {
        return input;
    }

    // Max edit distance per lookup (maxEditDistanceLookup <= maxEditDistanceDictionary);
    // for compound lookups it applies per single word, not per whole input string.
    const int maxEditDistanceLookup = 2;

    List<SymSpell.SuggestItem> suggestions;
    if (input.IndexOf(' ') < 0)
    {
        // Single word spell check.
        suggestions = spellChecker.Lookup(input, SymSpell.Verbosity.Closest, maxEditDistanceLookup);
    }
    else
    {
        // Multi word spell check.
        suggestions = spellChecker.LookupCompound(input, maxEditDistanceLookup);
    }

    // Return the first suggestion if one exists, otherwise hand the input back.
    return (suggestions != null && suggestions.Count != 0) ? suggestions[0].term : input;
}
// Interactive demo: loads the English frequency dictionary located in the application's base
// directory, prints load statistics, then prints compound-lookup suggestions for every
// non-empty line typed on the console. An empty line (or end of input) ends the program.
public static void Main(string[] args)
{
    // SymSpell construction parameters.
    const int initialCapacity = 82765;
    const int maxEditDistance = 2;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    Console.Write("Creating dictionary ...");
    long memBefore = GC.GetTotalMemory(true);
    Stopwatch timer = Stopwatch.StartNew();

    // Frequency dictionary: term in column 0, count in column 1.
    // Per the original notes, the word list combines Google Books Ngram frequencies with the
    // SCOWL spell-checker word lists. When the SymSpell NuGet package is used instead of the
    // core project, the dictionary file ships with the package and the path must be adjusted.
    string path = AppDomain.CurrentDomain.BaseDirectory + "frequency_dictionary_en_82_765.txt";
    bool loaded = symSpell.LoadDictionary(path, 0, 1);
    if (!loaded)
    {
        Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path));
        Console.ReadKey();
        return;
    }

    // Alternative (not used here): build the dictionary from a text corpus such as
    // http://norvig.com/big.txt. The corpus should be free of spelling errors and have
    // representative word frequencies. Phrases can be added as dictionary entries to
    // extend correction beyond single words.

    timer.Stop();
    long memDelta = GC.GetTotalMemory(true) - memBefore;
    Console.WriteLine($"\rDictionary: {symSpell.WordCount:N0} words, {symSpell.EntryCount:N0} entries, edit distance={symSpell.MaxDictionaryEditDistance} in {timer.Elapsed.TotalMilliseconds:0.0}ms {memDelta / 1024 / 1024.0:N0} MB");

    // Warm up; the result is intentionally discarded.
    symSpell.LookupCompound("isit");

    Console.WriteLine("Type in a word or phrase and hit enter to get suggestions:");
    while (true)
    {
        string input = (Console.ReadLine() ?? "").Trim();
        if (string.IsNullOrEmpty(input))
        {
            break;
        }

        Correct(input, symSpell);
    }
}
// Runs a compound lookup for the input, using the dictionary's maximum edit distance, and
// writes one line per suggestion to the console: term, edit distance, and count ("N0" format).
private static void Correct(string input, SymSpell symSpell)
{
    List<SymSpell.SuggestItem> found = symSpell.LookupCompound(input, symSpell.MaxDictionaryEditDistance);

    foreach (var item in found)
    {
        Console.WriteLine($"{item.term} {item.distance} {item.count:N0}");
    }
}
/// <summary>
/// Builds a spelling-corrected proposal for a chat line and shows it to the user for review.
/// The text up to and including the first ':' is kept as-is; the text after it, minus the
/// final five characters, is run through <c>WordSegmentation</c> and its corrected string
/// replaces the original post text.
/// </summary>
/// <param name="tempHtmlLines">The chat line to correct.</param>
/// <param name="spellingEngine">SymSpell instance used for word segmentation/correction.</param>
/// <returns>
/// The text from the dialog's suggestion box when the dialog result is true, otherwise the
/// automatically corrected text; "\r\n" is appended in both cases.
/// </returns>
private string FixBadSpelling(string tempHtmlLines, SymSpell spellingEngine)
{
    // Number of trailing characters excluded from correction (and from the result).
    // NOTE(review): presumably a closing tag or terminator on the line - confirm with the caller.
    const int trailingLength = 5;

    SpellingCorrection dialog = new SpellingCorrection();
    dialog.OriginalChatText.Text = tempHtmlLines;

    // Space out '*' markers so they are separate tokens, and turn runs of spaced-out
    // dots into an ellipsis (longest pattern first so shorter patterns do not split them).
    tempHtmlLines = tempHtmlLines.Replace("*", " * ");
    tempHtmlLines = tempHtmlLines.Replace(". . . . .", "... ");
    tempHtmlLines = tempHtmlLines.Replace(". . . .", "... ");
    tempHtmlLines = tempHtmlLines.Replace(". . .", "... ");
    tempHtmlLines = tempHtmlLines.Replace(". .", "... ");

    // The post text starts right after the first ':' (index 0 when there is no colon).
    int postStartIndex = tempHtmlLines.IndexOf(':') + 1;
    int postLength = tempHtmlLines.Length - postStartIndex - trailingLength;

    // Guard: a line with fewer than trailingLength characters after the colon previously
    // made Substring throw ArgumentOutOfRangeException. There is nothing to correct then.
    string fixedLine = string.Empty;
    if (postLength > 0)
    {
        string postSubString = tempHtmlLines.Substring(postStartIndex, postLength);
        fixedLine = spellingEngine.WordSegmentation(postSubString).correctedString;
    }

    // Re-attach the untouched prefix and undo the spacing added around '*'.
    string fixedStuff = (tempHtmlLines.Substring(0, postStartIndex) + " " + fixedLine).Replace(" * ", "*");

    dialog.SuggestedChatTextTextBox.Text = fixedStuff;
    dialog.ShowDialog();

    bool accepted = dialog.DialogResult.HasValue && dialog.DialogResult.Value;
    return (accepted ? dialog.SuggestedChatTextTextBox.Text : fixedStuff) + "\r\n";
}
/// <summary>
/// Command-line front end for SymSpell.
/// Usage: DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity].
/// Loads or creates the dictionary, writes status information to stderr, then reads input
/// terms from stdin (until an empty line or end of input) and writes results to stdout.
/// With fewer than three arguments the usage text is printed instead.
/// </summary>
/// <param name="args">Command-line arguments as described in the usage text.</param>
static void Main(string[] args)
{
    if (args.Length <= 2)
    {
        // Not enough arguments: print help.
        Console.WriteLine("SymSpell.CommandLine DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity]");
        Console.WriteLine();
        Console.WriteLine("DictionaryType=load|create");
        Console.WriteLine(" load: load dictionary from dictionary file");
        Console.WriteLine(" create: create dictionary from text corpus");
        Console.WriteLine("DictionaryPath: path to dictionary/corpus file");
        Console.WriteLine("PrefixLength: default=7 (speed/memory consumption trade-off)"); //dictionary param
        Console.WriteLine(" 5: low memory, slow lookup");
        Console.WriteLine(" 6: medium memory, medium lookup");
        Console.WriteLine(" 7: high memory, fast lookup");
        // lookup: intended for correction of a single word.
        // lookupcompound: intended for correction of multiple words; it can insert only a single
        //   space per token and is faster than wordsegmentation.
        // wordsegmentation: intended for segmentation and correction of multiple words; it can
        //   insert multiple spaces per token and is slower than lookupcompound.
        Console.WriteLine("LookupType=lookup|lookupcompound|wordsegment");
        Console.WriteLine(" lookup: correct single word");
        Console.WriteLine(" lookupcompound: correct multiple-word string (supports splitting/merging)");
        Console.WriteLine(" wordsegment: word segment and correct input string");
        Console.WriteLine("MaxEditDistance: default=2 (0: no correction, word segmentation only)");
        Console.WriteLine("OutputStats=false|true");
        Console.WriteLine(" false: only corrected string");
        Console.WriteLine(" true: corrected string, edit distance, word frequency/probability");
        Console.WriteLine("Verbosity=top|closest|all"); //no effect for lookupcompound and wordsegment
        Console.WriteLine(" top: Top suggestion");
        Console.WriteLine(" closest: All suggestions of smallest edit distance found");
        Console.WriteLine(" all: All suggestions within maxEditDistance");
        Console.WriteLine();
        return;
    }

    Console.Error.Write("Creating dictionary ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = new Stopwatch();
    stopWatch.Start();

    // Parameters.
    int initialCapacity = 82765;
    int termIndex = 0;  // column of the term in the dictionary text file
    int countIndex = 1; // column of the term frequency in the dictionary text file

    // DictionaryType. Must match exactly: the previous substring test against "load.create"
    // also accepted values such as "", "." or "oad", which then fell through the switch
    // below and left the dictionary empty without any error.
    string dictionaryType = args[0].ToLower();
    if (dictionaryType != "load" && dictionaryType != "create")
    {
        Console.Error.WriteLine("Error in parameter 1");
        return;
    }

    // DictionaryPath, relative to the application's base directory.
    string dictionaryPath = AppDomain.CurrentDomain.BaseDirectory + args[1];

    // PrefixLength is optional: when args[2] is a number it is the prefix length and all
    // following parameters shift by one position.
    int offset = 0;
    string lookupType = "";
    int prefixLength;
    if (int.TryParse(args[2], out prefixLength))
    {
        offset = 1;
    }
    else
    {
        prefixLength = 7;
    }

    // LookupType (exact match, for the same reason as DictionaryType).
    // When it is absent (only possible if PrefixLength was given), input lines are read but
    // produce no output; that existing behavior is kept.
    if (args.Length > 2 + offset)
    {
        lookupType = args[2 + offset].ToLower();
        if (lookupType != "lookup" && lookupType != "lookupcompound" && lookupType != "wordsegment")
        {
            Console.Error.WriteLine("Error in parameter " + (3 + offset).ToString());
            return;
        }
    }

    // MaxEditDistance: maximum edit distance per dictionary precalculation.
    int maxEditDistanceDictionary = 2;
    if (args.Length > 3 + offset)
    {
        if (!int.TryParse(args[3 + offset], out maxEditDistanceDictionary))
        {
            Console.Error.WriteLine("Error in parameter " + (4 + offset).ToString());
            return;
        }
    }

    // OutputStats: false, true.
    bool outputStats = false;
    if (args.Length > 4 + offset)
    {
        if (!bool.TryParse(args[4 + offset], out outputStats))
        {
            Console.Error.WriteLine("Error in parameter " + (5 + offset).ToString());
            return;
        }
    }

    // Verbosity: Top, Closest, All.
    // Enum.TryParse also succeeds for numeric strings that are not defined members,
    // so the parsed value is additionally checked with Enum.IsDefined.
    var suggestionVerbosity = SymSpell.Verbosity.Top;
    if (args.Length > 5 + offset)
    {
        if (!Enum.TryParse(args[5 + offset], true, out suggestionVerbosity)
            || !Enum.IsDefined(typeof(SymSpell.Verbosity), suggestionVerbosity))
        {
            Console.Error.WriteLine("Error in parameter " + (6 + offset).ToString());
            return;
        }
    }

    // Create object.
    var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary, prefixLength);

    // Load or create the dictionary.
    switch (dictionaryType)
    {
        case "load":
            if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;

        case "create":
            if (!symSpell.CreateDictionary(dictionaryPath))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;

        default:
            break;
    }

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;

    // Not to stdout but to Console.Error: status info stays on the console and is not
    // redirected or piped together with the results.
    Console.Error.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, " + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString() + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms " + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

    // Warm up; the result is intentionally discarded.
    symSpell.Lookup("warmup", SymSpell.Verbosity.All);

    // Process input lines until an empty line or end of input.
    string inputTerm;
    while (!string.IsNullOrEmpty(inputTerm = (Console.ReadLine() ?? "").Trim()))
    {
        switch (lookupType)
        {
            case "lookup":
                var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceDictionary, true);
                // Display suggestions, optionally with edit distance and term frequency.
                foreach (var suggestion in suggestions)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                    }
                    else
                    {
                        Console.WriteLine(suggestion.term);
                    }
                }
                break;

            case "lookupcompound":
                var suggestions2 = symSpell.LookupCompound(inputTerm);
                // Display suggestions, optionally with edit distance and term frequency.
                foreach (var suggestion in suggestions2)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                    }
                    else
                    {
                        Console.WriteLine(suggestion.term);
                    }
                }
                break;

            case "wordsegment":
                var suggestions3 = symSpell.WordSegmentation(inputTerm);
                // Display corrected strings, optionally with distance sum and log probability sum.
                foreach (var suggestion in suggestions3)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.correctedString + " " + suggestion.distanceSum.ToString("N0") + " " + suggestion.probabilityLogSum.ToString());
                    }
                    else
                    {
                        Console.WriteLine(suggestion.correctedString);
                    }
                }
                break;

            default:
                break;
        }
    }
}
/// <summary>
/// Searches the document collection for the words of <paramref name="query"/>.
/// Each lower-cased query word goes through a pipeline: exact match in <paramref name="dict"/>;
/// otherwise, for words of up to three characters, the first prefix suggestion from
/// <paramref name="trie"/>; otherwise the top compound spelling correction. Suggestions and
/// corrections are fed back into the pipeline. Every document containing a matched word gets
/// one result whose score counts the matches.
/// </summary>
/// <param name="query">Input query; words are separated by spaces.</param>
/// <param name="trie">Trie for prefix/infix matching.</param>
/// <param name="symSpell">Spelling corrector.</param>
/// <param name="inverter">Inverted index: word id -> ids of documents containing the word.</param>
/// <param name="dict">Word -> its id (order).</param>
/// <param name="documents">Collection of documents, indexed by document id.</param>
/// <param name="limit">
/// Result limit. NOTE(review): currently not applied, and results are not sorted - confirm
/// whether callers expect this method to sort and truncate.
/// </param>
/// <returns>One <c>SearchResult</c> per matched document, in no particular order.</returns>
static SearchResult[] Search(
    // input query
    string query,
    // trie for prefix/infix matching
    PatriciaSuffixTrie <string> trie,
    SymSpell symSpell,
    // inverted index
    Dictionary <int, HashSet <int> > inverter,
    // word -> its order
    Dictionary <string, int> dict,
    // collection of documents
    List <string> documents,
    // limit
    int limit
    )
{
    var separators = new[] { ' ' };
    var aggregated = new Dictionary<int, SearchResult>();

    // Work queue. Each token carries the list of words it was derived from, so that an
    // expansion which leads back to one of its own ancestors can be detected. Expansion
    // depends only on the word itself, so such a repeat would otherwise loop forever.
    var tokens = new LinkedList<KeyValuePair<string, List<string>>>();

    // Empty tokens (from repeated or leading/trailing spaces) are skipped: they can never
    // match and their "correction" is again the empty string.
    foreach (var queryWord in query.ToLower().Split(separators, StringSplitOptions.RemoveEmptyEntries))
    {
        tokens.AddLast(new KeyValuePair<string, List<string>>(queryWord, new List<string>()));
    }

    while (tokens.Count > 0)
    {
        // pop_front the queue
        var token = tokens.First.Value;
        tokens.RemoveFirst();
        string word = token.Key;
        List<string> ancestors = token.Value;

        // 1. Exact match: credit every document that contains the word.
        int wordId;
        if (dict.TryGetValue(word, out wordId))
        {
            foreach (var doc in inverter[wordId])
            {
                SearchResult hit;
                if (!aggregated.TryGetValue(doc, out hit))
                {
                    aggregated[doc] = new SearchResult(documents[doc], 1);
                }
                else
                {
                    // NOTE(review): this only updates the stored result if SearchResult is a
                    // reference type - confirm.
                    hit.score++;
                }
            }
            continue;
        }

        // The word needs expanding. Drop it if it was already expanded further up this
        // derivation chain.
        if (ancestors.Contains(word))
        {
            continue;
        }
        var lineage = new List<string>(ancestors) { word };

        // 2. Short words (length <= 3): try the first prefix suggestion.
        if (word.Length <= 3)
        {
            string suggestion = null;
            foreach (var suggest in trie.Retrieve(word))
            {
                suggestion = suggest;
                Console.WriteLine("Prefix matched: " + suggestion);
                break;
            }

            if (suggestion != null)
            {
                // push_front
                tokens.AddFirst(new KeyValuePair<string, List<string>>(suggestion, lineage));
                continue;
            }
        }

        // 3. No prefix suggestion: correct the spelling and re-queue the corrected words
        //    at the front, preserving their order.
        var corrections = symSpell.LookupCompound(word);
        if (corrections.Count == 0)
        {
            continue;
        }

        var lookupResult = corrections[0].term.Split(separators, StringSplitOptions.RemoveEmptyEntries);
        for (int i = lookupResult.Length - 1; i >= 0; --i)
        {
            tokens.AddFirst(new KeyValuePair<string, List<string>>(lookupResult[i], lineage));
        }
    }

    return aggregated.Values.ToArray();
}
/// <summary>
/// Runs a compound lookup on <paramref name="text"/> with the given maximum edit distance.
/// </summary>
/// <param name="text">Text to correct.</param>
/// <param name="distance">Maximum edit distance passed to <c>LookupCompound</c>.</param>
/// <returns>The suggestion list exactly as returned by <c>LookupCompound</c>.</returns>
public List <SymSpell.SuggestItem> correctText(string text, int distance)
{
    return symSpell.LookupCompound(text, distance);
}
/// <summary>
/// Experiment 3: concatenates the OCR full-text annotations of every JSON file in D:\json\,
/// runs a SymSpell compound lookup over the whole text, aligns the corrected tokens with the
/// original OCR tokens by similarity, labels each pair ("igual", "Corregida", "Espacios" or
/// "Error") and hands the rows to <c>CreateExcelFileExperimento</c>.
/// </summary>
private static void Experimento3()
{
    const string jsonDirectory = @"D:\json\";
    const string dictionaryPath = @"D:\load8.txt";

    EditDistanceLength editDistance = new EditDistanceLength();

    // SymSpell parameters.
    const int initialCapacity = 82765;
    const int maxEditDistance = 5;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    Dictionary<int, ExperimentSpell> excelMatrix = new Dictionary<int, ExperimentSpell>();

    // Collect the OCR text of every response in every file.
    StringBuilder OCROriginal = new StringBuilder();
    foreach (string path in Directory.GetFiles(jsonDirectory))
    {
        string jsonText = File.ReadAllText(path, Encoding.Default);
        var response = Google.Protobuf.JsonParser.Default.Parse<Google.Cloud.Vision.V1.AnnotateFileResponse>(jsonText);
        foreach (var respuestas in response.Responses)
        {
            var annotation = respuestas.FullTextAnnotation;
            if (annotation != null)
            {
                OCROriginal.Append(annotation.Text);
            }
        }
    }

    // Without a dictionary the lookup result is meaningless, so stop instead of silently
    // producing a spreadsheet from an empty dictionary.
    if (!symSpell.LoadDictionary(dictionaryPath, 0, 1))
    {
        Console.Error.WriteLine("File not found: " + Path.GetFullPath(dictionaryPath));
        return;
    }

    string ocrText = OCROriginal.ToString();
    List<SymSpell.SuggestItem> suggestions = symSpell.LookupCompound(ocrText, 2);

    // NOTE(review): this tokenizes SuggestItem.ToString() rather than the suggestion's term;
    // the brace stripping suggests ToString() wraps the item in "{...}" - confirm intent.
    var arraySymspell = suggestions[0].ToString()
        .Replace("\n", " ").Replace("{", "").Replace("}", "")
        .Split(' ');

    // Normalize punctuation in the original text before tokenizing. ": " is protected with a
    // "***" placeholder while the remaining colons are turned into spaces.
    var arrayOCROriginal = ocrText
        .Replace("\n", " ").Replace("{", "").Replace("}", "")
        .Replace(": ", "***").Replace(" : ", " ").Replace(":", " ").Replace("***", ": ")
        .Replace(". ", " ").Replace(", ", " ").Replace("-", " ")
        .Split(' ');

    // Maps a similarity score to its label; null when it is below every threshold.
    Func<double, string> classify = score =>
        score == 1 ? "igual"
        : score >= .4 ? "Corregida"
        : score > 0.06 ? "Espacios"
        : null;

    int j = 0, k = 0;
    for (int i = 0; i < arraySymspell.Length; i++)
    {
        // All original tokens consumed.
        if (j == arrayOCROriginal.Length)
        {
            break;
        }

        string corrected = arraySymspell[i];

        // First compare against the current original token.
        string label = classify(editDistance.CalculateSimilarity(corrected, arrayOCROriginal[j].ToLower()));

        ExperimentSpell exp1 = new ExperimentSpell();
        exp1.correctionLookupCompound = corrected;

        if (label != null)
        {
            exp1.original = arrayOCROriginal[j];
            // "Espacios" keeps the cursor in place so the next corrected token is compared
            // against the same original token; the other labels consume it.
            if (label != "Espacios")
            {
                j++;
            }
        }
        else
        {
            // No match at the cursor: try the previous original token.
            double previous = j > 0
                ? editDistance.CalculateSimilarity(corrected, arrayOCROriginal[j - 1].ToLower())
                : 0;
            label = classify(previous);
            if (label != null)
            {
                j--;
                exp1.original = arrayOCROriginal[j];
            }
            else
            {
                // Then try the next original token.
                double next = j + 1 < arrayOCROriginal.Length
                    ? editDistance.CalculateSimilarity(corrected, arrayOCROriginal[j + 1].ToLower())
                    : 0;
                label = classify(next);
                if (label != null)
                {
                    j++;
                    exp1.original = arrayOCROriginal[j];
                }
                else
                {
                    // Nothing nearby is similar: record an error against the current
                    // token and move on.
                    label = "Error";
                    exp1.original = arrayOCROriginal[j];
                    j++;
                }
            }
        }

        exp1.correction = label;
        excelMatrix.Add(k++, exp1);
    }

    CreateExcelFileExperimento(excelMatrix, "3");
}
/// <summary>
/// Experiment 2.1: concatenates the OCR full-text annotations of every JSON file in D:\json\,
/// runs a SymSpell compound lookup over the whole text and pairs the corrected tokens with
/// the original OCR tokens position by position (every row labelled "igual"), then hands the
/// rows to <c>CreateExcelFileExperimento</c>.
/// </summary>
private static void Experimento2_1()
{
    const string jsonDirectory = @"D:\json\";
    const string dictionaryPath = @"D:\load8.txt";

    Stopwatch stopWatch = new Stopwatch();

    // SymSpell parameters.
    const int initialCapacity = 82765;
    const int maxEditDistance = 5;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    Dictionary<int, ExperimentSpell> excelMatrix = new Dictionary<int, ExperimentSpell>();

    // Collect the OCR text of every response in every file.
    StringBuilder OCROriginal = new StringBuilder();
    foreach (string path in Directory.GetFiles(jsonDirectory))
    {
        string jsonText = File.ReadAllText(path, Encoding.Default);
        var response = Google.Protobuf.JsonParser.Default.Parse<Google.Cloud.Vision.V1.AnnotateFileResponse>(jsonText);
        foreach (var respuestas in response.Responses)
        {
            var annotation = respuestas.FullTextAnnotation;
            if (annotation != null)
            {
                OCROriginal.Append(annotation.Text);
            }
        }
    }

    // Time dictionary loading plus the compound lookup.
    // NOTE(review): the elapsed time is measured but never reported - confirm whether it
    // should be written somewhere.
    stopWatch.Start();

    // Without a dictionary the lookup result is meaningless, so stop instead of silently
    // producing a spreadsheet from an empty dictionary.
    if (!symSpell.LoadDictionary(dictionaryPath, 0, 1))
    {
        stopWatch.Stop();
        Console.Error.WriteLine("File not found: " + Path.GetFullPath(dictionaryPath));
        return;
    }

    string ocrText = OCROriginal.ToString();
    List<SymSpell.SuggestItem> suggestions = symSpell.LookupCompound(ocrText, 2);
    stopWatch.Stop();

    // NOTE(review): this tokenizes SuggestItem.ToString() rather than the suggestion's term,
    // and unlike Experimento3 only "}" (not "{") is stripped - confirm intent.
    var arraySymspell = suggestions[0].ToString()
        .Replace("\n", " ").Replace("}", "")
        .Split(' ');

    // Normalize punctuation in the original text before tokenizing. ": " is protected with a
    // "***" placeholder while the remaining colons are turned into spaces.
    var arrayOCROriginal = ocrText
        .Replace("\n", " ").Replace("}", "")
        .Replace(": ", "***").Replace(" : ", " ").Replace(":", " ").Replace("***", ": ")
        .Replace(". ", " ").Replace(", ", " ").Replace("-", " ")
        .Split(' ');

    // One row per corrected token; the original token at the same position, or an empty
    // string once the original tokens run out.
    for (int i = 0; i < arraySymspell.Length; i++)
    {
        ExperimentSpell exp1 = new ExperimentSpell();
        exp1.correction = "igual";
        exp1.correctionLookupCompound = arraySymspell[i];
        exp1.original = i < arrayOCROriginal.Length ? arrayOCROriginal[i] : "";
        excelMatrix.Add(i, exp1);
    }

    CreateExcelFileExperimento(excelMatrix, "2");
}
// Interactive phrase-search demo. Reads phrases (one per line) from "all-suggests-cleaned.txt"
// in the application's base directory, indexes them in a trie, builds a SymSpell dictionary
// from the word frequencies of those phrases, then answers console queries with the matching
// phrases and spelling suggestions. Typing "exit" or closing the input ends the program.
public static void Main(string[] args)
{
    var path = AppDomain.CurrentDomain.BaseDirectory + @"all-suggests-cleaned.txt";

    Console.Write("Creating trie ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = new Stopwatch();
    stopWatch.Start();

    var wordToIndex = new Dictionary<string, int>();   // word -> order of first appearance
    var wordFrequency = new Dictionary<string, int>(); // word -> number of occurrences
    var phraseList = new List<string>();
    int count = 0;

    using (StreamReader sr = new StreamReader(path))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            var phrase = line.Trim();
            phraseList.Add(phrase);

            foreach (var token in phrase.Split(' '))
            {
                if (!wordToIndex.ContainsKey(token))
                {
                    wordToIndex[token] = count++;
                }

                // TryGetValue leaves freq at 0 for a word that has not been seen yet.
                int freq;
                wordFrequency.TryGetValue(token, out freq);
                wordFrequency[token] = freq + 1;
            }
        }
    }

    long memDeltaForStoringValues = GC.GetTotalMemory(true) - memSize;
    Console.WriteLine("Memory for storing value: " + memDeltaForStoringValues + ". Going to add to trie");

    // The trie maps each phrase to its index in phraseList.
    var trie = new UkkonenTrie<int>(1);
    int value = 0;
    foreach (var phrase in phraseList)
    {
        trie.Add(phrase, value++);
    }

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;
    Console.WriteLine("Done in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms " + (memDelta / 1024 / 1024.0).ToString("N0") + " MB. Token count: " + wordToIndex.Count);

    // Spell checker built from the observed word frequencies.
    var spellChecker = new SymSpell(wordToIndex.Count, 2);
    foreach (var entry in wordFrequency)
    {
        spellChecker.CreateDictionaryEntry(entry.Key, entry.Value);
    }

    while (true)
    {
        Console.WriteLine("Input string to search:");
        var s = Console.ReadLine();

        // ReadLine returns null at end of input (e.g. redirected stdin); treat it like
        // "exit" instead of failing on s.ToLower().
        if (s == null || s == "exit")
        {
            return;
        }

        var normalized = s.ToLower();
        var suggests = spellChecker.LookupCompound(normalized, 2);

        // Lookup in trie.
        var resultCount = 0;
        foreach (var result in trie.Retrieve(normalized))
        {
            Console.WriteLine("--> " + phraseList[result]);
            resultCount++;
        }

        foreach (var sug in suggests)
        {
            Console.WriteLine("Can search for: " + sug.term);
        }

        // Offer the top suggestion only when it differs from what was typed.
        if (suggests.Count > 0 && suggests[0].term != normalized)
        {
            Console.WriteLine("Did you mean: " + suggests[0].term + "?");
        }

        Console.WriteLine(String.Format("Found {0} result", resultCount));
    }
}