/// <summary>
/// Runs word segmentation on <paramref name="input"/> and prints the result as a single console line.
/// </summary>
/// <param name="input">Text handed unchanged to <c>WordSegmentation</c>.</param>
/// <param name="symSpell">Spelling engine that performs the segmentation.</param>
private static void Correct(string input, SymSpell symSpell)
        {
            //segment (and, depending on the engine's settings, correct) the whole input string
            var segmentation = symSpell.WordSegmentation(input);

            //output format: correctedString, distanceSum ("N0" format), probabilityLogSum - separated by single spaces
            string distanceText    = segmentation.distanceSum.ToString("N0");
            string probabilityText = segmentation.probabilityLogSum.ToString();

            Console.WriteLine($"{segmentation.correctedString} {distanceText} {probabilityText}");
        }
        /// <summary>
        /// Interactive demo: loads the English frequency dictionary, prints load statistics and then
        /// segments every line typed on the console until an empty line (or end of input) is read.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            //engine settings
            const int initialCapacity = 82765;
            const int maxEditDistance = 0;
            const int prefixLength    = 7;
            var symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

            Console.Write("Creating dictionary ...");
            long      memoryBefore = GC.GetTotalMemory(true);
            Stopwatch loadTimer    = Stopwatch.StartNew();

            //Frequency dictionary shipped next to the executable (path referencing the SymSpell core project).
            //According to the original author's notes the dictionary combines two sources:
            //  Google Books Ngram data - representative word frequencies (but many misspelled entries)
            //  SCOWL (Spell Checker Oriented Word Lists) - genuine English vocabulary (but no frequencies)
            //When using the symspell nuget package the file is at "../../frequency_dictionary_en_82_765.txt" instead.
            string path = AppDomain.CurrentDomain.BaseDirectory + "frequency_dictionary_en_82_765.txt";

            //column 0 holds the term, column 1 its frequency
            bool loaded = symSpell.LoadDictionary(path, 0, 1);
            if (!loaded)
            {
                Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path));
                Console.ReadKey();
                return;
            }

            //Alternative (not used here): build the dictionary from a text corpus such as http://norvig.com/big.txt
            //via CreateDictionary(). The corpus should be free of spelling errors and have representative word
            //frequencies; phrases can be added as dictionary entries to correct e.g. "unitedkingom" to "united kingdom".

            loadTimer.Stop();
            long   memDelta          = GC.GetTotalMemory(true) - memoryBefore;
            double memDeltaMegabytes = memDelta / 1024 / 1024.0;

            Console.WriteLine($"\rDictionary: {symSpell.WordCount:N0} words, {symSpell.EntryCount:N0} entries, edit distance={symSpell.MaxDictionaryEditDistance} in {loadTimer.Elapsed.TotalMilliseconds:0.0}ms {memDeltaMegabytes:N0} MB");

            //warm up: one throw-away segmentation before the interactive loop
            symSpell.WordSegmentation("isit");

            Console.WriteLine("Type in a text and hit enter to get word segmentation and correction:");
            while (true)
            {
                //a null line (end of input) is treated like an empty line
                string typed = (Console.ReadLine() ?? "").Trim();
                if (typed.Length == 0)
                {
                    break;
                }

                Correct(typed, symSpell);
            }
        }
        /// <summary>
        /// Proposes a spelling-corrected version of one chat line, shows it in a <c>SpellingCorrection</c>
        /// dialog and returns the resulting line terminated with "\r\n".
        /// </summary>
        /// <remarks>
        /// Everything up to and including the first ':' is kept verbatim. The text after it, minus the
        /// last five characters, is passed through <c>WordSegmentation</c>.
        /// NOTE(review): the five dropped characters are presumably a trailing closing tag - confirm against the caller.
        /// They are not re-appended to the returned line (unchanged from the previous behaviour).
        /// </remarks>
        /// <param name="tempHtmlLines">The raw chat line.</param>
        /// <param name="spellingEngine">Engine used for word segmentation.</param>
        /// <returns>
        /// The dialog's suggested text when the dialog result is <c>true</c>, otherwise the automatically
        /// corrected line; "\r\n" is appended in both cases.
        /// </returns>
        private string FixBadSpelling(string tempHtmlLines, SymSpell spellingEngine)
        {
            SpellingCorrection dialog = new SpellingCorrection();

            dialog.OriginalChatText.Text = tempHtmlLines;

            //Space out '*' markers so they become separate tokens, and collapse spaced-out dots into "... "
            //(longest pattern first so shorter patterns do not split longer runs)
            tempHtmlLines = tempHtmlLines.Replace("*", " * ");
            tempHtmlLines = tempHtmlLines.Replace(". . . . .", "... ");
            tempHtmlLines = tempHtmlLines.Replace(". . . .", "... ");
            tempHtmlLines = tempHtmlLines.Replace(". . .", "... ");
            tempHtmlLines = tempHtmlLines.Replace(". .", "... ");

            //IndexOf returns -1 when there is no ':', so the post then starts at index 0
            int postStartIndex = tempHtmlLines.IndexOf(':') + 1;

            //Length of the post without its last five characters. For short lines this is zero or negative;
            //previously that made Substring throw ArgumentOutOfRangeException.
            int postLength = tempHtmlLines.Length - postStartIndex - 5;

            //Only segment when there is text to segment. An unused LookupCompound pass over the whole line
            //used to run here as well; its result was never read, so it has been removed.
            string fixedLine = "";
            if (postLength > 0)
            {
                string postSubString = tempHtmlLines.Substring(postStartIndex, postLength);
                fixedLine = spellingEngine.WordSegmentation(postSubString).correctedString;
            }

            //Re-attach the untouched prefix and undo the '*' spacing introduced above
            string fixedStuff = (tempHtmlLines.Substring(0, postStartIndex) + " " + fixedLine).Replace(" * ", "*");

            dialog.SuggestedChatTextTextBox.Text = fixedStuff;
            dialog.ShowDialog();

            //The user's (possibly edited) suggestion wins only when the dialog result is true
            bool accepted = dialog.DialogResult.HasValue && dialog.DialogResult.Value;
            if (accepted)
            {
                return dialog.SuggestedChatTextTextBox.Text + "\r\n";
            }

            return fixedStuff + "\r\n";
        }
        /// <summary>
        /// Command-line front end: builds a SymSpell dictionary as described by the arguments, then reads
        /// terms from standard input (one per line, until an empty line or end of input) and writes the
        /// corrections to standard output. Status and error messages go to standard error.
        /// </summary>
        /// <remarks>
        /// Argument order: DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity].
        /// With fewer than three arguments the usage text is printed instead.
        /// DictionaryType and LookupType must match one of their keywords exactly (case-insensitive), and
        /// LookupType is mandatory; a bad or missing value is reported as "Error in parameter N".
        /// </remarks>
        /// <param name="args">Command-line arguments as described above.</param>
        static void Main(string[] args)
        {
            if (args.Length > 2)
            {
                Console.Error.Write("Creating dictionary ...");
                long      memSize   = GC.GetTotalMemory(true);
                Stopwatch stopWatch = new Stopwatch();
                stopWatch.Start();

                //parameters
                int initialCapacity = 82765;
                int termIndex       = 0; //column of the term in the dictionary text file
                int countIndex      = 1; //column of the term frequency in the dictionary text file

                //dictionaryType: exact keyword match (a substring test would also accept "", "oad", "d.c", ...)
                string dictionaryType = args[0].ToLowerInvariant();
                if (dictionaryType != "load" && dictionaryType != "create")
                {
                    Console.Error.WriteLine("Error in parameter 1"); return;
                }

                //dictionaryPath (relative to the application's base directory)
                string dictionaryPath = AppDomain.CurrentDomain.BaseDirectory + args[1];

                //prefix length (optional parameter): if the third argument is a number it is the prefix length
                //and all following parameters shift by one position
                int offset = 0;
                int prefixLength;
                if (int.TryParse(args[2], out prefixLength))
                {
                    offset = 1;
                }
                else
                {
                    prefixLength = 7;
                }

                //lookupType: mandatory; without it the read loop below would silently print nothing
                if (args.Length <= 2 + offset)
                {
                    Console.Error.WriteLine("Error in parameter " + (3 + offset).ToString()); return;
                }

                string lookupType = args[2 + offset].ToLowerInvariant();
                if (lookupType != "lookup" && lookupType != "lookupcompound" && lookupType != "wordsegment")
                {
                    Console.Error.WriteLine("Error in parameter " + (3 + offset).ToString()); return;
                }

                //maxEditDistance
                int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
                if (args.Length > 3 + offset)
                {
                    if (!int.TryParse(args[3 + offset], out maxEditDistanceDictionary))
                    {
                        Console.Error.WriteLine("Error in parameter " + (4 + offset).ToString()); return;
                    }
                }

                //output stats
                bool outputStats = false;//false, true
                if (args.Length > 4 + offset)
                {
                    if (!bool.TryParse(args[4 + offset], out outputStats))
                    {
                        Console.Error.WriteLine("Error in parameter " + (5 + offset).ToString()); return;
                    }
                }

                //verbosity (parsed case-insensitively)
                var suggestionVerbosity = SymSpell.Verbosity.Top; //Top, Closest, All
                if (args.Length > 5 + offset)
                {
                    if (!Enum.TryParse(args[5 + offset], true, out suggestionVerbosity))
                    {
                        Console.Error.WriteLine("Error in parameter " + (6 + offset).ToString()); return;
                    }
                }

                //create object
                var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary, prefixLength);

                //load dictionary
                switch (dictionaryType)
                {
                case "load":
                    if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
                    {
                        Console.Error.WriteLine("File not found!");
                        return;
                    }
                    break;

                case "create":
                    if (!symSpell.CreateDictionary(dictionaryPath))
                    {
                        Console.Error.WriteLine("File not found!");
                        return;
                    }
                    break;

                default:
                    break;
                }

                stopWatch.Stop();
                long memDelta = GC.GetTotalMemory(true) - memSize;

                //not to stdout, but to Console.Error: status info will always be on console, but not redirected or piped
                Console.Error.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
                                        + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
                                        + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
                                        + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

                //warm up: one throw-away lookup before processing real input
                symSpell.Lookup("warmup", SymSpell.Verbosity.All);

                //process input line by line; an empty line or end of input stops the loop
                string inputTerm;
                while (!string.IsNullOrEmpty(inputTerm = (Console.ReadLine() ?? "").Trim()))
                {
                    switch (lookupType)
                    {
                    case "lookup":
                        var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceDictionary, true);
                        //display suggestions, edit distance and term frequency
                        foreach (var suggestion in suggestions)
                        {
                            if (outputStats)
                            {
                                Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                            }
                            else
                            {
                                Console.WriteLine(suggestion.term);
                            }
                        }
                        break;

                    case "lookupcompound":
                        var suggestions2 = symSpell.LookupCompound(inputTerm);
                        //display suggestions, edit distance and term frequency
                        foreach (var suggestion in suggestions2)
                        {
                            if (outputStats)
                            {
                                Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                            }
                            else
                            {
                                Console.WriteLine(suggestion.term);
                            }
                        }
                        break;

                    case "wordsegment":
                        var suggestions3 = symSpell.WordSegmentation(inputTerm);
                        //display corrected string, edit distance sum and log-probability sum
                        foreach (var suggestion in suggestions3)
                        {
                            if (outputStats)
                            {
                                Console.WriteLine(suggestion.correctedString + " " + suggestion.distanceSum.ToString("N0") + " " + suggestion.probabilityLogSum.ToString());
                            }
                            else
                            {
                                Console.WriteLine(suggestion.correctedString);
                            }
                        }
                        break;

                    default:
                        break;
                    }
                }
            }
            else
            {
                //PrefixLength is number

                //help
                Console.WriteLine("SymSpell.CommandLine DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity]");
                Console.WriteLine();
                Console.WriteLine("DictionaryType=load|create");
                Console.WriteLine("   load: load dictionary from dictionary file");
                Console.WriteLine("   create: create dictionary from text corpus");
                Console.WriteLine("DictionaryPath: path to dictionary/corpus file");
                Console.WriteLine("PrefixLength: default=7 (speed/memory consumption trade-off)");  //dictionary param
                Console.WriteLine("   5: low memory, slow lookup");
                Console.WriteLine("   6: medium memory, medium lookup");
                Console.WriteLine("   7: high memory, fast lookup");
                //lookup intended for correction of single word
                //lookupcompound intended for correction of multiple words, it can insert only a single space per token, faster than wordsegmentation
                //wordsegmentation intended for segmentation and correction of multiple words, it can insert multiple spaces per token, slower than lookupcompound
                Console.WriteLine("LookupType=lookup|lookupcompound|wordsegment");
                Console.WriteLine("   lookup: correct single word");
                Console.WriteLine("   lookupcompound: correct multiple-word string (supports splitting/merging)");
                Console.WriteLine("   wordsegment: word segment and correct input string");
                Console.WriteLine("MaxEditDistance: default=2 (0: no correction, word segmentation only)");
                Console.WriteLine("OutputStats=false|true");
                Console.WriteLine("   false: only corrected string");
                Console.WriteLine("   true: corrected string, edit distance, word frequency/probability");
                Console.WriteLine("Verbosity=top|closest|all"); //no effect for lookupcompound and wordsegment
                Console.WriteLine("   top: Top suggestion");
                Console.WriteLine("   closest: All suggestions of smallest edit distance found");
                Console.WriteLine("   all: All suggestions within maxEditDistance");
                Console.WriteLine();
            }
        }
Beispiel #5
0
        /// <summary>
        /// Spell-corrects a review file. Reads the file named by <c>args[0]</c> line by line and writes the
        /// result both to the console and to a sibling file whose name has "_adj" inserted before the
        /// extension (e.g. "reviews.txt" becomes "reviews_adj.txt").
        /// </summary>
        /// <remarks>
        /// Expected layout (taken from the original author's sample input): a date line, then a score line,
        /// then one or more review lines, e.g. "January 25, 2019" / "5" / "gooood".
        /// Date and score lines are copied unchanged; review lines are corrected by <c>CorrectReviewLine</c>.
        /// The input is streamed line by line rather than read in one go because the content may be very large.
        /// </remarks>
        /// <param name="args">args[0] is the path of the review file to process.</param>
        static void Main(string[] args)
        {
            //Fail fast with a message instead of an IndexOutOfRangeException when no file was given
            if (args.Length < 1 || string.IsNullOrEmpty(args[0]))
            {
                Console.Error.WriteLine("Missing argument: path of the review file to process.");
                return;
            }

            string inputPath = args[0];

            //Insert "_adj" in front of the real extension, whatever its length (or absence).
            //The previous fixed "Length - 4" offset only worked for three-letter extensions and threw on short names.
            string extension    = Path.GetExtension(inputPath);
            string adjustedPath = inputPath.Substring(0, inputPath.Length - extension.Length) + "_adj" + extension;

            //create object
            int initialCapacity           = 82765;
            int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
            var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

            //load dictionary (located three directories above the application's base directory)
            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            Console.WriteLine(baseDirectory);

            string dictionaryPath = baseDirectory + "../../../frequency_dictionary_en_82_765.txt";

            int termIndex  = 0; //column of the term in the dictionary text file
            int countIndex = 1; //column of the term frequency in the dictionary text file

            if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
            {
                Console.WriteLine("File not found!");
                //press any key to exit program
                Console.ReadKey();
                return;
            }

            //Index of the upcoming line inside the current record: a date line resets it to 1 (score expected next),
            //the score line moves it to 2, every review line increments it.
            //NOTE(review): because it starts at 0, a file that does not begin with a date has its first line
            //corrected as a review and its second line passed through as a score - kept as in the original.
            int recordLineIndex = 0;

            //Open the input first so that a missing input file does not leave an empty output file behind
            using (StreamReader sr = new StreamReader(inputPath))
            using (StreamWriter sw = new StreamWriter(adjustedPath))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    string outputLine;

                    if (isFormatOfDate(line))
                    {
                        //date line: copied unchanged
                        outputLine      = line;
                        recordLineIndex = 1;
                    }
                    else if (recordLineIndex == 1)
                    {
                        //score line (the line right after a date): copied unchanged
                        outputLine      = line;
                        recordLineIndex = 2;
                    }
                    else
                    {
                        //review text: segment and correct
                        outputLine = CorrectReviewLine(line, symSpell);
                        recordLineIndex++;
                    }

                    Console.WriteLine(outputLine);
                    sw.WriteLine(outputLine);
                }
            }
        }

        /// <summary>
        /// Corrects one review line: every maximal run of English letters and spaces is replaced by the
        /// <c>correctedString</c> of <c>WordSegmentation</c>; all other characters are copied unchanged in place.
        /// </summary>
        /// <param name="reviewLine">The review line to correct.</param>
        /// <param name="symSpell">Engine used for word segmentation and correction.</param>
        /// <returns>The corrected line.</returns>
        private static string CorrectReviewLine(string reviewLine, SymSpell symSpell)
        {
            var corrected  = new System.Text.StringBuilder(reviewLine.Length);
            var pendingRun = new System.Text.StringBuilder();

            foreach (char current in reviewLine)
            {
                if (current == ' ' || isEnglishLetter(current))
                {
                    //still inside a correctable run
                    pendingRun.Append(current);
                    continue;
                }

                //a non-letter character ends the run: correct what was collected, then keep the character as is
                if (pendingRun.Length > 0)
                {
                    corrected.Append(symSpell.WordSegmentation(pendingRun.ToString()).correctedString);
                    pendingRun.Clear();
                }

                corrected.Append(current);
            }

            //flush a run that reaches the end of the line
            if (pendingRun.Length > 0)
            {
                corrected.Append(symSpell.WordSegmentation(pendingRun.ToString()).correctedString);
            }

            return corrected.ToString();
        }
        /// <summary>
        /// Thin wrapper that forwards <paramref name="text"/> and <paramref name="distance"/> to
        /// <c>WordSegmentation</c> on the <c>symSpell</c> member and returns its result unchanged.
        /// </summary>
        /// <param name="text">Text to segment.</param>
        /// <param name="distance">
        /// Second argument of <c>WordSegmentation</c>.
        /// NOTE(review): presumably the maximum edit distance - confirm against the SymSpell API.
        /// </param>
        /// <returns>The tuple (segmentedString, correctedString, distanceSum, probabilityLogSum) produced by the engine.</returns>
        public (string segmentedString, string correctedString, int distanceSum, decimal probabilityLogSum) segmentText(string text, int distance)
            => symSpell.WordSegmentation(text, distance);