/// <summary>
/// Runs word segmentation on <paramref name="input"/> and prints the result to standard output.
/// </summary>
/// <param name="input">Text to segment and correct.</param>
/// <param name="symSpell">Engine whose <c>WordSegmentation</c> method is invoked.</param>
private static void Correct(string input, SymSpell symSpell)
{
    var segmentation = symSpell.WordSegmentation(input);

    // One output line: corrected text, then the distanceSum and probabilityLogSum
    // fields of the segmentation result, separated by single spaces.
    string distanceText = segmentation.distanceSum.ToString("N0");
    string probabilityText = segmentation.probabilityLogSum.ToString();
    Console.WriteLine(string.Join(" ", segmentation.correctedString, distanceText, probabilityText));
}
/// <summary>
/// Interactive word-segmentation demo. Loads the English frequency dictionary, prints load
/// statistics, then segments each line read from the console until a blank line or end of
/// input is reached.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Engine parameters.
    const int initialCapacity = 82765;
    const int maxEditDistance = 0;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    Console.Write("Creating dictionary ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = Stopwatch.StartNew();

    // Frequency dictionary, looked up in the application's base directory.
    // Per the original notes, the word list combines Google Books Ngram frequencies with
    // the SCOWL spell-checker word lists. When the SymSpell NuGet package is used instead
    // of the core project, the file is expected at "../../frequency_dictionary_en_82_765.txt".
    // A dictionary can alternatively be built from a clean text corpus
    // (e.g. http://norvig.com/big.txt) via CreateDictionary.
    string path = AppDomain.CurrentDomain.BaseDirectory + "frequency_dictionary_en_82_765.txt";
    bool loaded = symSpell.LoadDictionary(path, 0, 1);
    if (!loaded)
    {
        Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path));
        Console.ReadKey();
        return;
    }

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;
    Console.WriteLine(
        $"\rDictionary: {symSpell.WordCount:N0} words, {symSpell.EntryCount:N0} entries, edit distance={symSpell.MaxDictionaryEditDistance} in {stopWatch.Elapsed.TotalMilliseconds:0.0}ms {memDelta / 1024 / 1024.0:N0} MB");

    // Warm up: the result is intentionally discarded.
    symSpell.WordSegmentation("isit");

    Console.WriteLine("Type in a text and hit enter to get word segmentation and correction:");
    while (true)
    {
        string rawLine = Console.ReadLine();
        string input = rawLine == null ? "" : rawLine.Trim();
        if (input.Length == 0)
        {
            break;
        }

        Correct(input, symSpell);
    }
}
/// <summary>
/// Spell-corrects the message part of a chat line with SymSpell word segmentation and lets
/// the user review the suggestion in a <c>SpellingCorrection</c> dialog.
/// </summary>
/// <param name="tempHtmlLines">
/// The chat line. Everything up to and including the first ':' is kept as the prefix; the
/// text after it, minus its last five characters, is passed to the spelling engine.
/// </param>
/// <param name="spellingEngine">Engine whose <c>WordSegmentation</c> method is invoked.</param>
/// <returns>
/// The dialog's (possibly user-edited) text when the dialog result is <c>true</c>, otherwise
/// the automatic suggestion; in both cases followed by "\r\n".
/// </returns>
private string FixBadSpelling(string tempHtmlLines, SymSpell spellingEngine)
{
    // Number of trailing characters excluded from correction.
    // NOTE(review): presumably a closing tag / line terminator - confirm against the caller.
    const int trailerLength = 5;

    SpellingCorrection dialog = new SpellingCorrection();
    dialog.OriginalChatText.Text = tempHtmlLines;

    // Space out '*' tags, then rewrite spaced-out dot runs as "... " (longest pattern first).
    tempHtmlLines = tempHtmlLines
        .Replace("*", " * ")
        .Replace(". . . . .", "... ")
        .Replace(". . . .", "... ")
        .Replace(". . .", "... ")
        .Replace(". .", "... ");

    // IndexOf returns -1 when there is no ':', so the post then starts at index 0.
    int postStartIndex = tempHtmlLines.IndexOf(':') + 1;

    // Clamp the length: with fewer than trailerLength characters after the colon the
    // unclamped value is negative and Substring would throw ArgumentOutOfRangeException.
    int postLength = Math.Max(0, tempHtmlLines.Length - postStartIndex - trailerLength);
    string postSubString = tempHtmlLines.Substring(postStartIndex, postLength);

    // Nothing to segment when the post is empty. (The previous implementation also ran a
    // LookupCompound pass over the whole line here and discarded its result.)
    string fixedLine = postSubString.Length == 0
        ? string.Empty
        : spellingEngine.WordSegmentation(postSubString).correctedString;

    string prefix = tempHtmlLines.Substring(0, postStartIndex);
    string fixedStuff = (prefix + " " + fixedLine).Replace(" * ", "*");

    dialog.SuggestedChatTextTextBox.Text = fixedStuff;
    dialog.ShowDialog();

    // Prefer the text from the dialog only when it was explicitly accepted.
    if (dialog.DialogResult.HasValue && dialog.DialogResult.Value)
    {
        return dialog.SuggestedChatTextTextBox.Text + "\r\n";
    }

    return fixedStuff + "\r\n";
}
/// <summary>
/// Command-line front end for SymSpell. Builds a dictionary from a file, then answers every
/// non-empty line read from standard input with suggestions written to standard output.
/// Status and error messages go to standard error so they stay visible when standard output
/// is redirected or piped.
/// </summary>
/// <param name="args">
/// DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity].
/// With fewer than three arguments the usage text is printed instead.
/// </param>
static void Main(string[] args)
{
    if (args.Length <= 2)
    {
        //help
        Console.WriteLine("SymSpell.CommandLine DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity]");
        Console.WriteLine();
        Console.WriteLine("DictionaryType=load|create");
        Console.WriteLine(" load: load dictionary from dictionary file");
        Console.WriteLine(" create: create dictionary from text corpus");
        Console.WriteLine("DictionaryPath: path to dictionary/corpus file");
        Console.WriteLine("PrefixLength: default=7 (speed/memory consumption trade-off)"); //dictionary param
        Console.WriteLine(" 5: low memory, slow lookup");
        Console.WriteLine(" 6: medium memory, medium lookup");
        Console.WriteLine(" 7: high memory, fast lookup");
        //lookup intended for correction of single word
        //lookupcompound intended for correction of multiple words, it can insert only a single space per token, faster than wordsegmentation
        //wordsegmentation intended for segmentation and correction of multiple words, it can insert multiple spaces per token, slower than lookupcompound
        Console.WriteLine("LookupType=lookup|lookupcompound|wordsegment");
        Console.WriteLine(" lookup: correct single word");
        Console.WriteLine(" lookupcompound: correct multiple-word string (supports splitting/merging)");
        Console.WriteLine(" wordsegment: word segment and correct input string");
        Console.WriteLine("MaxEditDistance: default=2 (0: no correction, word segmentation only)");
        Console.WriteLine("OutputStats=false|true");
        Console.WriteLine(" false: only corrected string");
        Console.WriteLine(" true: corrected string, edit distance, word frequency/probability");
        Console.WriteLine("Verbosity=top|closest|all"); //no effect for lookupcompound and wordsegment
        Console.WriteLine(" top: Top suggestion");
        Console.WriteLine(" closest: All suggestions of smallest edit distance found");
        Console.WriteLine(" all: All suggestions within maxEditDistance");
        Console.WriteLine();
        return;
    }

    Console.Error.Write("Creating dictionary ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = new Stopwatch();
    stopWatch.Start();

    //parameters
    int initialCapacity = 82765;
    int termIndex = 0;  //column of the term in the dictionary text file
    int countIndex = 1; //column of the term frequency in the dictionary text file

    //dictionaryType
    // Exact match required: a substring test would let values such as "" or "oad" through,
    // after which no dictionary would be built and no error reported.
    string dictionaryType = args[0].ToLowerInvariant();
    if (dictionaryType != "load" && dictionaryType != "create")
    {
        Console.Error.WriteLine("Error in parameter 1");
        return;
    }

    //dictionaryPath (resolved against the application's base directory)
    string dictionaryPath = AppDomain.CurrentDomain.BaseDirectory + args[1];

    //prefix length (optional parameter): when args[2] is numeric it is the prefix length and
    //all following parameters shift by one position
    int offset = 0;
    int prefixLength;
    if (int.TryParse(args[2], out prefixLength))
    {
        offset = 1;
    }
    else
    {
        prefixLength = 7;
    }

    //lookupType
    // Stays empty when omitted; the input loop below then produces no output.
    string lookupType = "";
    if (args.Length > 2 + offset)
    {
        lookupType = args[2 + offset].ToLowerInvariant();
        bool knownLookupType = lookupType == "lookup"
            || lookupType == "lookupcompound"
            || lookupType == "wordsegment";
        if (!knownLookupType)
        {
            Console.Error.WriteLine("Error in parameter " + (3 + offset).ToString());
            return;
        }
    }

    //maxEditDistance
    int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
    if (args.Length > 3 + offset)
    {
        if (!int.TryParse(args[3 + offset], out maxEditDistanceDictionary))
        {
            Console.Error.WriteLine("Error in parameter " + (4 + offset).ToString());
            return;
        }
    }

    //output stats
    bool outputStats = false; //false, true
    if (args.Length > 4 + offset)
    {
        if (!bool.TryParse(args[4 + offset], out outputStats))
        {
            Console.Error.WriteLine("Error in parameter " + (5 + offset).ToString());
            return;
        }
    }

    //verbosity
    var suggestionVerbosity = SymSpell.Verbosity.Top; //Top, Closest, All
    if (args.Length > 5 + offset)
    {
        if (!Enum.TryParse(args[5 + offset], true, out suggestionVerbosity))
        {
            Console.Error.WriteLine("Error in parameter " + (6 + offset).ToString());
            return;
        }
    }

    //create object
    var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary, prefixLength);

    //load dictionary
    switch (dictionaryType)
    {
        case "load":
            if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;

        case "create":
            if (!symSpell.CreateDictionary(dictionaryPath))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;

        default:
            break;
    }

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;

    //not to stdout, but to Console.Error: status info will always be on console, but not redirected or piped
    Console.Error.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
        + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
        + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
        + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

    //warm up (result intentionally discarded)
    symSpell.Lookup("warmup", SymSpell.Verbosity.All);

    //answer each input line until an empty line or end of input
    string inputTerm;
    while (!string.IsNullOrEmpty(inputTerm = (Console.ReadLine() ?? "").Trim()))
    {
        switch (lookupType)
        {
            case "lookup":
                var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceDictionary, true);
                //display suggestions, edit distance and term frequency
                foreach (var suggestion in suggestions)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                    }
                    else
                    {
                        Console.WriteLine(suggestion.term);
                    }
                }
                break;

            case "lookupcompound":
                var suggestions2 = symSpell.LookupCompound(inputTerm);
                //display suggestions, edit distance and term frequency
                foreach (var suggestion in suggestions2)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                    }
                    else
                    {
                        Console.WriteLine(suggestion.term);
                    }
                }
                break;

            case "wordsegment":
                var suggestions3 = symSpell.WordSegmentation(inputTerm);
                // NOTE(review): this iterates the WordSegmentation result, while other code in
                // this file reads the result's fields directly as a single value - confirm the
                // targeted SymSpell version returns a sequence here.
                //display corrected string, edit distance sum and log probability sum
                foreach (var suggestion in suggestions3)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.correctedString + " " + suggestion.distanceSum.ToString("N0") + " " + suggestion.probabilityLogSum.ToString());
                    }
                    else
                    {
                        Console.WriteLine(suggestion.correctedString);
                    }
                }
                break;

            default:
                break;
        }
    }
}
/// <summary>
/// Spell-corrects the review text of a file made of date / score / review-line records.
/// Date lines and the score line that follows a date are copied unchanged; in every other
/// line each run of English letters and spaces is replaced by its SymSpell word segmentation
/// while all other characters are kept as they are. Each output line is echoed to the console
/// and written to a sibling file whose name has "_adj" inserted before the extension.
/// </summary>
/// <param name="args"><c>args[0]</c> is the path of the review file to process.</param>
static void Main(string[] args)
{
    // Fail fast instead of crashing on args[0] after the dictionary has been loaded.
    if (args.Length < 1)
    {
        Console.Error.WriteLine("Usage: <program> <review-file>");
        return;
    }

    //create object
    int initialCapacity = 82765;
    int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
    var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

    //load dictionary (path is relative to the application's base directory)
    string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
    Console.WriteLine(baseDirectory);
    string dictionaryPath = baseDirectory + "../../../frequency_dictionary_en_82_765.txt";
    int termIndex = 0;  //column of the term in the dictionary text file
    int countIndex = 1; //column of the term frequency in the dictionary text file
    if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
    {
        Console.WriteLine("File not found!");
        //press any key to exit program
        Console.ReadKey();
        return;
    }

    // Insert "_adj" before the real extension ("t1.txt" -> "t1_adj.txt"). Using the actual
    // extension length also handles names without an extension or with one that is not
    // three characters long.
    string inputPath = args[0];
    string extension = Path.GetExtension(inputPath);
    string nameOfadjustedDocument =
        inputPath.Substring(0, inputPath.Length - extension.Length) + "_adj" + extension;

    // True only for the line directly after a date line: that line is the score.
    bool expectScore = false;

    // The input is processed line by line rather than read as a whole, since it may be large.
    // The reader is opened first so a missing input file does not leave an empty output file.
    using (StreamReader sr = new StreamReader(inputPath))
    using (StreamWriter sw = new StreamWriter(nameOfadjustedDocument))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string outputLine;
            if (isFormatOfDate(line))
            {
                // A date starts a new record; the next line is its score.
                outputLine = line;
                expectScore = true;
            }
            else if (expectScore)
            {
                // The score is copied unchanged.
                outputLine = line;
                expectScore = false;
            }
            else
            {
                // Review text: correct runs of letters/spaces, pass everything else through.
                var pendingText = new System.Text.StringBuilder();
                var adjustedReviewLine = new System.Text.StringBuilder();
                foreach (char charInLine in line)
                {
                    if (charInLine == ' ' || isEnglishLetter(charInLine))
                    {
                        pendingText.Append(charInLine);
                        continue;
                    }

                    if (pendingText.Length > 0)
                    {
                        //word segmentation and correction for multi-word input strings with/without spaces
                        var suggestion = symSpell.WordSegmentation(pendingText.ToString());
                        adjustedReviewLine.Append(suggestion.correctedString);
                        pendingText.Clear();
                    }
                    adjustedReviewLine.Append(charInLine);
                }

                // Flush the run that reaches the end of the line.
                if (pendingText.Length > 0)
                {
                    var suggestion = symSpell.WordSegmentation(pendingText.ToString());
                    adjustedReviewLine.Append(suggestion.correctedString);
                }

                outputLine = adjustedReviewLine.ToString();
            }

            Console.WriteLine(outputLine);
            sw.WriteLine(outputLine);
        }
    }
}
/// <summary>
/// Forwards <paramref name="text"/> and <paramref name="distance"/> to
/// <c>symSpell.WordSegmentation</c> and returns its result unchanged.
/// </summary>
/// <param name="text">Text handed to the word segmentation.</param>
/// <param name="distance">Second argument passed through to <c>WordSegmentation</c>.</param>
/// <returns>The tuple produced by <c>WordSegmentation</c>.</returns>
public (string segmentedString, string correctedString, int distanceSum, decimal probabilityLogSum) segmentText(string text, int distance)
    => symSpell.WordSegmentation(text, distance);