Exemple #1
0
        /// <summary>
        /// Builds a <see cref="SymSpell"/> instance and fills it from
        /// <c>frequency_dictionary_en_82_765.txt</c> in the application's base directory.
        /// </summary>
        /// <param name="ErrorMsg">
        /// Empty on success; otherwise the reason no instance is returned (the dictionary
        /// could not be loaded, or the text of the exception that was caught).
        /// </param>
        /// <returns>The loaded instance, or <c>null</c> on failure.</returns>
        public SymSpell CreateDictionary(out string ErrorMsg)
        {
            ErrorMsg = string.Empty;
            try
            {
                const int initialCapacity = 82765;
                const int maxEditDistance = 2;
                const int prefixLength    = 7;

                var symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

                // Path.Combine also copes with a base directory that lacks a trailing separator.
                string path = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "frequency_dictionary_en_82_765.txt");

                // Columns: 0 = term, 1 = term frequency.
                if (!symSpell.LoadDictionary(path, 0, 1))
                {
                    // Give the caller a reason instead of a bare null with an empty message.
                    ErrorMsg = "Dictionary could not be loaded: " + path;
                    return null;
                }

                // One throw-away lookup so the lookup code has executed once before real use.
                symSpell.Lookup("warmup", SymSpell.Verbosity.All);
                return symSpell;
            }
            catch (Exception ex)
            {
                ErrorMsg = ex.ToString();
                return null;
            }
        }
Exemple #2
0
        /// <summary>
        /// Registers the text post-processing pipeline as a singleton: a
        /// <c>RemoveEmptyLinesProcessor</c> (LF line-ending normalization) followed by a
        /// <c>PerWordProcessor</c> wrapping a <c>SymSpellProcessor</c>.
        /// </summary>
        /// <param name="services">Service collection that receives the combined processor.</param>
        private static void AddPostProcessing(IServiceCollection services)
        {
            var symSpell = new SymSpell();

            Console.Out.WriteLine("Loading SymSpell dictionary...");
            if (symSpell.LoadDictionary("../../ru.dict", termIndex: 0, countIndex: 1))
            {
                Console.Out.WriteLine("SymSpell initialized!");
            }
            else
            {
                // Registration still proceeds (best effort), but do not claim success:
                // without a dictionary the SymSpell step has no vocabulary to work with.
                Console.Error.WriteLine("SymSpell dictionary could not be loaded: ../../ru.dict");
            }

            var postProcessor = new CombinedProcessor(new ITextPostProcessor[]
            {
                new RemoveEmptyLinesProcessor(new RemoveEmptyLinesOptions
                {
                    NormalizeLineEndings = NormalizeLineEndingsStrategy.Lf
                }),

                new PerWordProcessor(new ITextPostProcessor[]
                {
                    new SymSpellProcessor(symSpell, 1, Enumerable.Empty <string>())
                })
            });

            services.AddSingleton(postProcessor);
        }
Exemple #3
0
        /// <summary>
        /// Click handler: lower-cases the text of <c>richTextBox1</c>, looks it up in a
        /// SymSpell dictionary and appends every suggested term to <c>listBox1</c>.
        /// </summary>
        /// <remarks>
        /// The dictionary is constructed and loaded on every click.
        /// NOTE(review): <c>listBox1</c> is not cleared first, so suggestions from earlier
        /// clicks remain in the list - confirm that this is intended.
        /// </remarks>
        private void button1_Click(object sender, EventArgs e)
        {
            //create object
            int initialCapacity           = 82765;
            int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
            var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

            //load dictionary
            string dictionaryPath = "../../frequency_dictionary_en_82_765.txt";
            int    termIndex      = 0; //column of the term in the dictionary text file
            int    countIndex     = 1; //column of the term frequency in the dictionary text file

            if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
            {
                richTextBox1.Text = "File not found!";
                // Stop here: otherwise the error text just written to the box would be
                // looked up below as if the user had typed it.
                return;
            }

            string lowerWrongWord = richTextBox1.Text.ToLower();

            int maxEditDistanceLookup = 1;                          //max edit distance per lookup (maxEditDistanceLookup<=maxEditDistanceDictionary)
            var suggestionVerbosity   = SymSpell.Verbosity.Closest; //Top, Closest, All
            var suggestions           = symSpell.Lookup(lowerWrongWord, suggestionVerbosity, maxEditDistanceLookup);

            foreach (var suggestion in suggestions)
            {
                listBox1.Items.Add(suggestion.term);
            }
        }
Exemple #4
0
        /// <summary>
        /// Pre-run executed before the timed benchmarks: loads the first benchmark dictionary
        /// into both the current and the <c>Original</c> SymSpell implementation and performs
        /// one lookup on each, so the code has run once before anything is measured.
        /// </summary>
        static void WarmUp()
        {
            const string probeTerm = "hockie";

            // Current implementation.
            var current = new SymSpell(16, 2, 7);
            current.LoadDictionary(DictionaryPath[0], 0, 1);
            var currentSuggestions = current.Lookup(probeTerm, SymSpell.Verbosity.All, 1);

            // Reference ("Original") implementation.
            var legacy = new Original.SymSpell(2, 7);
            legacy.LoadDictionary(DictionaryPath[0], "", 0, 1);
            var legacySuggestions = legacy.Lookup(probeTerm, "", 1, 2);
        }
        /// <summary>
        /// Console entry point of the word-segmentation demo. Loads the English frequency
        /// dictionary from the application's base directory, prints load statistics, then
        /// passes every line typed by the user to <c>Correct</c> until an empty line (or end
        /// of input) is read.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // SymSpell construction parameters.
            const int initialCapacity = 82765;
            const int maxEditDistance = 0;
            const int prefixLength    = 7;
            SymSpell speller = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

            Console.Write("Creating dictionary ...");
            long      heapBefore = GC.GetTotalMemory(true);
            Stopwatch loadTimer  = Stopwatch.StartNew();

            // The frequency dictionary is expected next to the executable
            // (with the NuGet package the file lives at "../../" instead).
            string path   = AppDomain.CurrentDomain.BaseDirectory + "frequency_dictionary_en_82_765.txt";
            bool   loaded = speller.LoadDictionary(path, 0, 1);
            if (!loaded)
            {
                Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path));
                Console.ReadKey();
                return;
            }

            loadTimer.Stop();
            long heapGrowth = GC.GetTotalMemory(true) - heapBefore;

            // Assemble the statistics line piece by piece.
            string words     = speller.WordCount.ToString("N0");
            string entries   = speller.EntryCount.ToString("N0");
            string distance  = speller.MaxDictionaryEditDistance.ToString();
            string millis    = loadTimer.Elapsed.TotalMilliseconds.ToString("0.0");
            string megabytes = (heapGrowth / 1024 / 1024.0).ToString("N0");
            Console.WriteLine("\rDictionary: " + words + " words, "
                              + entries + " entries, edit distance=" + distance
                              + " in " + millis + "ms "
                              + megabytes + " MB");

            // Warm up: one throw-away segmentation before the interactive loop.
            speller.WordSegmentation("isit");

            Console.WriteLine("Type in a text and hit enter to get word segmentation and correction:");
            while (true)
            {
                string input = (Console.ReadLine() ?? "").Trim();
                if (string.IsNullOrEmpty(input))
                {
                    break;
                }

                Correct(input, speller);
            }
        }
        /// <summary>
        /// Experiment 1: concatenates the full-text annotation of every Google Vision JSON
        /// response file found in <c>D:\json\</c>, splits the text into space-separated tokens,
        /// looks each token up in SymSpell (top suggestion only) and hands one row per token to
        /// <c>CreateExcelFileExperimento</c>.
        /// </summary>
        private static void Experimento1()
        {
            const string jsonDirectory  = @"D:\json\";
            const string dictionaryFile = @"D:\DictionaryFiles\default.txt";

            //Symspell parameters
            const int initialCapacity = 82765;
            const int maxEditDistance = 5;
            const int prefixLength    = 7;
            SymSpell  symSpell        = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

            // Fail fast: without a dictionary every lookup comes back empty and the sheet
            // would silently report "igual" for every token.
            if (!symSpell.LoadDictionary(dictionaryFile, 0, 1))
            {
                System.Console.Error.WriteLine("File not found: " + dictionaryFile);
                return;
            }

            // Collect the OCR text of all response files into one buffer.
            StringBuilder OCROriginal = new StringBuilder();
            foreach (string path in Directory.GetFiles(jsonDirectory))
            {
                string jsonText = File.ReadAllText(path, Encoding.Default);
                var    response = Google.Protobuf.JsonParser.Default.Parse <Google.Cloud.Vision.V1.AnnotateFileResponse>(jsonText);
                foreach (var respuestas in response.Responses)
                {
                    var annotation = respuestas.FullTextAnnotation;
                    if (annotation != null)
                    {
                        OCROriginal.Append(annotation.Text);
                    }
                }
            }

            // Strip braces and punctuation, then tokenize on single spaces.
            // "***" is a temporary placeholder that protects ": " while the other colon forms are replaced.
            var arrayOCROriginal = OCROriginal.ToString().Replace("\n", " ").Replace("{", "").Replace("}", "").Replace(": ", "***").Replace(" : ", " ").Replace(":", " ").Replace("***", ": ").Replace(". ", " ").Replace(", ", " ").Replace("-", " ").Split(' ');

            Dictionary <int, ExperimentSpell> excelMatrix = new Dictionary <int, ExperimentSpell>();
            int rowIndex = 0;

            foreach (string item in arrayOCROriginal)
            {
                // Default row: no suggestion, token reported unchanged.
                ExperimentSpell exp1 = new ExperimentSpell();
                exp1.correction = "igual";
                exp1.original   = item;
                exp1.correctionLookupCompound = item;

                // NOTE(review): the row is flagged "modificada" whenever any suggestion exists,
                // even if the suggested term equals the token - confirm this is the intended meaning.
                List <SymSpell.SuggestItem> suggestions = symSpell.Lookup(item, SymSpell.Verbosity.Top);
                if (suggestions.Count > 0)
                {
                    exp1.correction = "modificada";
                    exp1.correctionLookupCompound = suggestions[0].term;
                }
                excelMatrix.Add(rowIndex++, exp1);
            }
            CreateExcelFileExperimento(excelMatrix, "1");
        }
        /// <summary>
        /// Console entry point of the spelling-correction demo. Loads the English frequency
        /// dictionary, prints load statistics, then passes every line typed by the user to
        /// <c>Correct</c> until an empty line (or end of input) is read.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        public static void Main(string[] args)
        {
            Console.Write("Creating dictionary ...");
            long      memSize   = GC.GetTotalMemory(true);
            Stopwatch stopWatch = Stopwatch.StartNew();

            //set parameters
            const int initialCapacity = 82765;
            const int maxEditDistance = 2;
            const int prefixLength    = 7;
            var       symSpell        = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

            //Load a frequency dictionary.
            //The dictionary combines Google Books Ngram data (representative word frequencies, but many
            //misspelled entries) with SCOWL word lists (genuine English vocabulary, but no frequencies).
            //With the symspell nuget package the file is located at "../../frequency_dictionary_en_82_765.txt".
            //Alternatively a dictionary can be created from a spelling-error-free text corpus
            //(e.g. http://norvig.com/big.txt) via symSpell.CreateDictionary(path).
            string path = "../../../SymSpell/frequency_dictionary_en_82_765.txt";    //for spelling correction (genuine English words)

            if (!symSpell.LoadDictionary(path, 0, 1))
            {
                Console.Error.WriteLine("File not found: " + Path.GetFullPath(path));
                // Nothing useful can be done with an empty dictionary, so stop here instead of
                // printing statistics and prompting for input.
                return;
            }

            stopWatch.Stop();
            long memDelta = GC.GetTotalMemory(true) - memSize;

            Console.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
                              + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
                              + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
                              + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

            //warm up: one throw-away lookup before the interactive loop
            symSpell.Lookup("warmup", SymSpell.Verbosity.All, 1);

            string input;

            while (!string.IsNullOrEmpty(input = (Console.ReadLine() ?? "").Trim()))
            {
                Correct(input, symSpell);
            }
        }
Exemple #8
0
        /// <summary>
        /// Creates the <c>symSpell</c> instance and fills it from the "dataset" text asset
        /// obtained through <c>Resources.Load</c>; logs a message when loading reports failure.
        /// </summary>
        public void initDict()
        {
            int capacity              = 20000;
            int precalculatedDistance = 3;  //maximum edit distance per dictionary precalculation
            int termColumn            = 0;  //column of the term in the dictionary text
            int countColumn           = 1;  //column of the term frequency in the dictionary text

            symSpell = new SymSpell(capacity, precalculatedDistance);

            TextAsset dataset = Resources.Load <TextAsset>("dataset");
            bool      loaded  = symSpell.LoadDictionary(dataset, termColumn, countColumn);
            if (loaded)
            {
                return;
            }

            Debug.Log("Unable to load dictionary");
        }
Exemple #9
0
        /// <summary>
        /// Console entry point of the demo for the static SymSpell API. Sets the global
        /// parameters, loads the English frequency dictionary, prints load statistics, then
        /// passes every line typed by the user to <c>Correct</c> until an empty line (or end
        /// of input) is read.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        public static void Main(string[] args)
        {
            //set global parameters
            SymSpell.verbose         = 0;
            SymSpell.editDistanceMax = 2;
            SymSpell.lp = 7;

            Console.Write("Creating dictionary ...");
            Stopwatch stopWatch = Stopwatch.StartNew();

            //Load a frequency dictionary.
            //The dictionary combines Google Books Ngram data (representative word frequencies, but many
            //misspelled entries) with SCOWL word lists (genuine English vocabulary, but no frequencies).
            //With the symspell nuget package the file is located at "../../frequency_dictionary_en_82_765.txt".
            //Alternatively a dictionary can be created from a spelling-error-free text corpus
            //(e.g. http://norvig.com/big.txt) via SymSpell.CreateDictionary(path, "").
            string path = "../../../symspell/frequency_dictionary_en_82_765.txt";    //for spelling correction (genuine English words)

            if (!SymSpell.LoadDictionary(path, "", 0, 1))
            {
                Console.Error.WriteLine("File not found: " + Path.GetFullPath(path));
                // Nothing useful can be done with an empty dictionary, so stop here instead of
                // printing statistics and prompting for input.
                return;
            }

            stopWatch.Stop();

            // Process is IDisposable; release its handle once the memory figure has been read.
            long privateMegabytes;
            using (Process currentProcess = Process.GetCurrentProcess())
            {
                privateMegabytes = currentProcess.PrivateMemorySize64 / 1000000;
            }

            Console.WriteLine("\rDictionary: " + SymSpell.wordlist.Count.ToString("N0") + " words, " + SymSpell.dictionary.Count.ToString("N0") + " entries, edit distance=" + SymSpell.editDistanceMax.ToString() + " in " + stopWatch.ElapsedMilliseconds.ToString() + "ms " + privateMegabytes.ToString("N0") + " MB");

            //Benchmark("../../../symspelldemo/test_data/noisy_query_en_1000.txt",1000);

            string input;

            while (!string.IsNullOrEmpty(input = (Console.ReadLine() ?? "").Trim()))
            {
                Correct(input, "");
            }
        }
Exemple #10
0
        /// <summary>
        /// Creates the <c>sym</c> instance and loads the English frequency dictionary from the
        /// "SymSpell" folder below <c>Application.streamingAssetsPath</c>; logs an error when
        /// the load reports failure.
        /// </summary>
        private void InitSym()
        {
            int capacity              = 82765;
            int precalculatedDistance = 2;  //maximum edit distance per dictionary precalculation

            sym = new SymSpell(capacity, precalculatedDistance);

            string dictionaryFile = Path.Combine(Application.streamingAssetsPath, "SymSpell", "frequency_dictionary_en_82_765.txt");
            int    termColumn     = 0;      //column of the term in the dictionary text file
            int    countColumn    = 1;      //column of the term frequency in the dictionary text file

            bool loaded = sym.LoadDictionary(dictionaryFile, termColumn, countColumn);
            if (loaded)
            {
                return;
            }

            Debug.LogError("Dictionary file not found! Aborting...");
        }
Exemple #11
0
    /// <summary>
    /// Creates the <c>symSpell</c> instance and loads the English frequency dictionary from
    /// the "SymSpell" folder below <c>Application.dataPath</c>; logs a message when the load
    /// reports failure.
    /// </summary>
    public void LoadDictionary()
    {
        //create object
        int initialCapacity           = 82765;
        int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation

        symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

        //load dictionary
        // Path.Combine inserts the platform's directory separator; a hard-coded '\' only
        // resolves on Windows.
        string dictionaryPath = System.IO.Path.Combine(Application.dataPath, "SymSpell", "frequency_dictionary_en_82_765.txt");

        int termIndex  = 0; //column of the term in the dictionary text file
        int countIndex = 1; //column of the term frequency in the dictionary text file

        if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
        {
            Debug.Log("File not found!");
        }
    }
        /// <summary>
        /// Creates the <c>symSpell</c> instance and loads the English frequency dictionary from
        /// the application's base directory. Writes either a "File not found" message or the
        /// ready message to the console, never both.
        /// </summary>
        public SymSpellInterface()
        {
            int initialCapacity           = 82765;
            int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation

            symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

            //load dictionary
            // Path.Combine also copes with a base directory that lacks a trailing separator.
            string baseDirectory  = AppDomain.CurrentDomain.BaseDirectory;
            string dictionaryPath = System.IO.Path.Combine(baseDirectory, "frequency_dictionary_en_82_765.txt");
            int    termIndex      = 0; //column of the term in the dictionary text file
            int    countIndex     = 1; //column of the term frequency in the dictionary text file

            if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
            {
                Console.WriteLine("File not found " + dictionaryPath);
                // Do not announce readiness when the dictionary is missing.
                return;
            }

            Console.WriteLine("SymSpellInterface was initialized. You are ready to go!");
        }
Exemple #13
0
        /// <summary>
        /// Runs <c>compileDictionary</c>, creates the SymSpell instance and loads
        /// <c>CompiledDictionary</c> into it.
        /// </summary>
        /// <exception cref="FileNotFoundException">The dictionary load reported failure.</exception>
        public SpellChecker()
        {
            compileDictionary();

            // SymSpell construction parameters.
            const int capacity                 = 82765;
            const int precalculatedEditDistance = 2;

            this.spellChecker = new SymSpell(capacity, precalculatedEditDistance);

            int termColumn  = 0;    // column of the term in the dictionary text file
            int countColumn = 1;    // column of the term frequency in the dictionary text file

            bool loaded = this.spellChecker.LoadDictionary(CompiledDictionary, termColumn, countColumn);
            if (loaded)
            {
                return;
            }

            throw new FileNotFoundException("Dictionary Not found!");
        }
Exemple #14
0
        // Regression test: looks up the first column of every line of noisy_query_en_1000.txt
        // (Verbosity.Closest, maximum dictionary edit distance) and checks that the total
        // number of suggestions matches the recorded value.
        public void LookupShouldReplicateNoisyResults()
        {
            var dir = AppDomain.CurrentDomain.BaseDirectory;

            const int editDistanceMax          = 2;
            const int prefixLength             = 7;
            const SymSpell.Verbosity verbosity = SymSpell.Verbosity.Closest;
            var    symSpell = new SymSpell(83000, editDistanceMax, prefixLength);
            string path     = dir + "../../../SymSpell/frequency_dictionary_en_82_765.txt"; //for spelling correction (genuine English words)

            // A missing dictionary should fail here with a clear message rather than as a
            // wrong suggestion count at the end.
            Assert.IsTrue(symSpell.LoadDictionary(path, 0, 1), "Dictionary could not be loaded: " + path);

            int resultSum = 0;

            // Terms are looked up while streaming the file, so the test no longer depends on
            // the file holding exactly 1000 usable lines (a fixed-size buffer overflowed on
            // more and left null terms on fewer).
            using (StreamReader sr = new StreamReader(File.OpenRead(dir + "../../../SymSpell.Demo/test_data/noisy_query_en_1000.txt")))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // A null separator array splits on white space.
                    string[] lineParts = line.Split((char[])null);
                    if (lineParts.Length >= 2)
                    {
                        resultSum += symSpell.Lookup(lineParts[0], verbosity, symSpell.MaxDictionaryEditDistance).Count;
                    }
                }
            }

            Assert.AreEqual(4945, resultSum);
        }
    /// <summary>
    /// Initialization: resolves the target keyboard (falling back to
    /// <c>KeyboardLayout.Instance</c>), subscribes to its key-pressed event, then creates
    /// <c>symSpell</c> and loads the English frequency dictionary.
    /// </summary>
    void Start()
    {
        if (!targetkeyboard)
        {
            targetkeyboard = KeyboardLayout.Instance;
        }
        if (!targetkeyboard)
        {
            Debug.LogError("Target Keyboard Empty");
        }
        else
        {
            targetkeyboard.KeyboardLayout_OnKeyPressed += WordPrediction_KeyPressedHandler;
        }
        Debug.Log("Creating dictionary ...");

        //set parameters
        const int initialCapacity = 82765;
        const int maxEditDistance = 2;
        const int prefixLength    = 7;

        symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

        //Load a frequency dictionary.
        //The dictionary combines Google Books Ngram data (representative word frequencies, but many
        //misspelled entries) with SCOWL word lists (genuine English vocabulary, but no frequencies).
        // Path.Combine inserts the platform's directory separator; hard-coded '\' only
        // resolves on Windows.
        string path = System.IO.Path.Combine(Application.dataPath, "SpellChecker", "Resources", "frequency_dictionary_en_82_765.txt");

        if (!symSpell.LoadDictionary(path, 0, 1))
        {
            Debug.LogError("\rFile not found: " + System.IO.Path.GetFullPath(path));
        }

        //warm up: one throw-away lookup
        symSpell.Lookup("warmup", SymSpell.Verbosity.All);
    }
Exemple #16
0
        /// <summary>
        /// Loads the English frequency dictionary from the application's base directory into a
        /// new SymSpell instance and returns what <c>Correct</c> produces for
        /// <paramref name="word"/>.
        /// </summary>
        /// <param name="word">Input passed unchanged to <c>Correct</c>.</param>
        /// <returns>The list returned by <c>Correct</c>.</returns>
        /// <exception cref="System.IO.FileNotFoundException">The dictionary could not be loaded.</exception>
        /// <remarks>
        /// NOTE(review): the dictionary is rebuilt on every call; consider caching the loaded
        /// instance if this method is called repeatedly.
        /// </remarks>
        public static List <string> SymEnglishSpellChecker(string word)
        {
            //set parameters
            const int initialCapacity = 82765;
            const int maxEditDistance = 2;
            const int prefixLength    = 7;
            var       symSpell        = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

            //Load a frequency dictionary.
            //The dictionary combines Google Books Ngram data (representative word frequencies, but many
            //misspelled entries) with SCOWL word lists (genuine English vocabulary, but no frequencies).
            // Path.Combine also copes with a base directory that lacks a trailing separator.
            string path = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "frequency_dictionary_en_82_765.txt");

            if (!symSpell.LoadDictionary(path, 0, 1))
            {
                // Specific exception type (still an Exception for existing catch blocks) that
                // carries the offending path.
                throw new System.IO.FileNotFoundException("file not found", path);
            }

            return Correct(word, symSpell);
        }
Exemple #17
0
        static void Main(string[] args)
        {
            //Console.WriteLine("Hello World!");
            //create object
            int initialCapacity           = 82765;
            int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
            var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

            //load dictionary
            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;

            //string dictionaryPath= baseDirectory + "frequency_dictionary_en_82_765.txt";
            Console.WriteLine(baseDirectory);

            string dictionaryPath = baseDirectory + "../../../frequency_dictionary_en_82_765.txt";

            int termIndex  = 0; //column of the term in the dictionary text file
            int countIndex = 1; //column of the term frequency in the dictionary text file

            if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
            {
                Console.WriteLine("File not found!");
                //press any key to exit program
                Console.ReadKey();
                return;
            }

            int    i = 0;
            string contentUndetermine   = "";
            string contentDate          = "";
            string contentScore         = "";
            string contentLineOfReviews = "";

            /*
             *  http://www.vcskicks.com/read_text_file.php
             *  here is a method provided by vcskicks.com which allow user to read the whole document at once
             *  and pass all the content as a single string
             *  decide not to read the whole document as pass all the content as a string since the string may be so huge
             *
             */


            // string path = "C:/Users/kongwh/Desktop/test2/t1.txt";

            // StreamReader textFile = new StreamReader(path);

            // string input = textFile.ReadToEnd();

            // textFile.Close();

            /*
             *  https://www.tutorialspoint.com/csharp/csharp_text_files.htm
             *  here is a method provided by tutorial point which read a document line by line
             *  and pass each line as a string
             *
             *  and write string to a document
             */

            string line = "";
            //https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/main-and-command-args/command-line-arguments
            string nameOfadjustedDocument = args[0];

            //https://www.geeksforgeeks.org/c-sharp-insert-method/
            nameOfadjustedDocument = nameOfadjustedDocument.Insert(nameOfadjustedDocument.Length - 4, "_adj");

            using (StreamWriter sw = new StreamWriter(nameOfadjustedDocument)){
                //using(StreamWriter sw = new StreamWriter(adj.txt)){
                using (StreamReader sr = new StreamReader(args[0])) {
                    //using (StreamReader sr = new StreamReader("t1.txt")) {
                    while ((line = sr.ReadLine()) != null)
                    {
                        //Console.WriteLine("line: " + line);
                        //Console.WriteLine("line length: " + line.Length);

                        contentUndetermine = line;

                        if (isFormatOfDate(contentUndetermine))
                        {
                            //contentUndetermine is a date
                            contentDate = contentUndetermine;
                            i           = 1;
                            Console.WriteLine(contentDate);
                            sw.WriteLine(contentDate);
                        }
                        else if (i == 1)
                        {
                            //contentUndetermine is a score
                            contentScore = contentUndetermine;
                            Console.WriteLine(contentScore);
                            sw.WriteLine(contentScore);
                            i = 2;
                        }
                        else
                        {
                            string onlyEnglishAndSpace = "";
                            string notEnglishAndSpace  = "";
                            string adjustedReviewLine  = "";

                            contentLineOfReviews = contentUndetermine;

                            foreach (char charInLine in contentLineOfReviews)
                            {
                                if (charInLine.ToString().Contains(" ") || isEnglishLetter(charInLine))
                                {
                                    onlyEnglishAndSpace = onlyEnglishAndSpace + charInLine.ToString();
                                }
                                else
                                {
                                    notEnglishAndSpace = charInLine.ToString();

                                    if (onlyEnglishAndSpace.Equals(""))
                                    {
                                        adjustedReviewLine = adjustedReviewLine + notEnglishAndSpace;
                                    }
                                    else
                                    {
                                        //word segmentation and correction for multi-word input strings with/without spaces
                                        var suggestion = symSpell.WordSegmentation(onlyEnglishAndSpace);
                                        adjustedReviewLine = adjustedReviewLine + suggestion.correctedString + notEnglishAndSpace;
                                    }

                                    onlyEnglishAndSpace = "";
                                    notEnglishAndSpace  = "";
                                }
                            }


                            if (!onlyEnglishAndSpace.Equals(""))
                            {
                                //word segmentation and correction for multi-word input strings with/without spaces
                                var suggestion = symSpell.WordSegmentation(onlyEnglishAndSpace);
                                adjustedReviewLine = adjustedReviewLine + suggestion.correctedString;
                            }

                            Console.WriteLine(adjustedReviewLine);
                            sw.WriteLine(adjustedReviewLine);
                            i++;
                        }
                    }
                }
            }


            //----------------------------sample input----------------------------------------
            // //sample input
            // string input="January 25, 2019\n5\ngooood😋\nJune 25, 2019\n1\nsofarsogood\n";

            // //Console.WriteLine("input length: " + input.Length);
            // foreach(char c in input){

            //     contentUndetermine = contentUndetermine + c.ToString( );
            //     //Console.WriteLine("current: " + contentUndetermine);
            //     if(contentUndetermine.Contains("\r") || contentUndetermine.Contains("\n")){
            //         if(isFormatOfDate(contentUndetermine)){
            //             //contentUndetermine is a date
            //             contentDate = contentUndetermine;
            //             Console.WriteLine(contentDate);
            //             i = 1;
            //         }else if(i == 1){
            //             //contentUndetermine is a score
            //             contentScore = contentUndetermine;
            //             Console.WriteLine(contentScore);
            //             i = 2;
            //         }else{
            //             string onlyEnglishAndSpace = "";
            //             string notEnglishAndSpace = "";
            //             string adjustedReviewLine = "";

            //             contentLineOfReviews = contentUndetermine;
            //             foreach(char charInLine in contentLineOfReviews){
            //                 if(charInLine.ToString().Contains(" ") || isEnglishLetter(charInLine)){
            //                     onlyEnglishAndSpace = onlyEnglishAndSpace + charInLine.ToString();
            //                 }else{
            //                     notEnglishAndSpace = charInLine.ToString();

            //                     if(onlyEnglishAndSpace.Equals("")){
            //                         adjustedReviewLine = adjustedReviewLine + notEnglishAndSpace;
            //                     }else{
            //                         //word segmentation and correction for multi-word input strings with/without spaces
            //                         var suggestion = symSpell.WordSegmentation(onlyEnglishAndSpace);
            //                         adjustedReviewLine = adjustedReviewLine + suggestion.correctedString + notEnglishAndSpace;
            //                     }

            //                     onlyEnglishAndSpace = "";
            //                     notEnglishAndSpace = "";

            //                 }
            //             }
            //             // //word segmentation and correction for multi-word input strings with/without spaces
            //             // var suggestion = symSpell.WordSegmentation(contentLineOfReviews);

            //             // //display term and edit distance
            //             // Console.WriteLine(suggestion.correctedString);

            //             //Console.WriteLine(contentLineOfReviews);
            //             Console.WriteLine(adjustedReviewLine);
            //             adjustedReviewLine = "";
            //             i++;
            //         }
            //         //clear the content
            //         contentUndetermine = "";
            //     }
            // }
            //----------------------------sample input ends----------------------------------------

            //----------------functions used for spell check provided by symSpell------------------

            // //word segmentation and correction for multi-word input strings with/without spaces
            // var suggestion1 = symSpell.WordSegmentation(input);

            // //display term and edit distance
            // Console.WriteLine(suggestion1.correctedString);

            // //lookup suggestions for single-word input strings
            // string inputTerm="goodandnicedesign";
            // //string inputTerm=suggestion1.correctedString;
            // int maxEditDistanceLookup = 1; //max edit distance per lookup (maxEditDistanceLookup<=maxEditDistanceDictionary)
            // var suggestionVerbosity = SymSpell.Verbosity.Closest; //Top, Closest, All
            // var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceLookup);
            // //lookup suggestions for multi-word input strings (supports compound splitting & merging)
            // //inputTerm="whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixtgrade and ins pired him";
            // maxEditDistanceLookup = 2; //max edit distance per lookup (per single word, not per whole input string)
            // suggestions = symSpell.LookupCompound(inputTerm, maxEditDistanceLookup);

            // //display suggestions, edit distance and term frequency
            // foreach (var suggestion in suggestions)
            // {
            // Console.WriteLine(suggestion.term);
            // }



            //press any key to exit program
            //Console.ReadKey();
        }
        /// <summary>
        /// Command-line entry point: builds a SymSpell dictionary, reports its size, build time and
        /// memory use on standard error, then answers single-word lookups read from standard input
        /// until an empty line or end of input is reached.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: <c>load</c> (dictionary file) or <c>create</c> (text corpus), case-insensitive;
        /// <c>args[1]</c>: file path, appended to the application base directory;
        /// <c>args[2]</c> (optional): maximum edit distance, default 2;
        /// <c>args[3]</c> (optional): verbosity (Top, Closest or All), default Top;
        /// <c>args[4]</c> (optional): prefix length, default 7.
        /// With fewer than two arguments the usage text is printed instead.
        /// </param>
        static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                //help
                Console.WriteLine("SymSpell.CommandLine load   Path [MaxEditDistance] [Verbosity] [PrefixLength]");
                Console.WriteLine("SymSpell.CommandLine create Path [MaxEditDistance] [Verbosity] [PrefixLength]");
                Console.WriteLine();
                Console.WriteLine("load: load dictionary from dictionary file");
                Console.WriteLine("create: create dictionary from text corpus");
                Console.WriteLine("MaxEditDistance: default=2");
                Console.WriteLine("Verbosity=Top|Closest|All (case-sensitive)");
                Console.WriteLine("PrefixLength: default=7 (5:low memory; 7:fast lookup)");
                Console.WriteLine();
                return;
            }

            Console.Error.Write("Creating dictionary ...");
            long      memSize   = GC.GetTotalMemory(true);
            Stopwatch stopWatch = new Stopwatch();
            stopWatch.Start();

            //parameters
            int initialCapacity = 82765;

            //maximum edit distance per dictionary precalculation
            int maxEditDistanceDictionary = 2;
            if (args.Length > 2 && !int.TryParse(args[2], out maxEditDistanceDictionary))
            {
                Console.Error.WriteLine("Error in parameter 3");
                return;
            }

            //max edit distance per lookup
            int maxEditDistanceLookup = maxEditDistanceDictionary;

            //Top, Closest, All
            //Enum.TryParse also accepts numeric text such as "42"; IsDefined rejects values
            //that do not correspond to a declared Verbosity member.
            var suggestionVerbosity = SymSpell.Verbosity.Top;
            if (args.Length > 3
                && (!Enum.TryParse(args[3], out suggestionVerbosity)
                    || !Enum.IsDefined(typeof(SymSpell.Verbosity), suggestionVerbosity)))
            {
                Console.Error.WriteLine("Error in parameter 4");
                return;
            }

            int prefixLength = 7;
            if (args.Length > 4 && !int.TryParse(args[4], out prefixLength))
            {
                Console.Error.WriteLine("Error in parameter 5");
                return;
            }

            string dictionaryPath = AppDomain.CurrentDomain.BaseDirectory + args[1]; // "../../../../SymSpell/frequency_dictionary_en_82_765.txt";
            int    termIndex      = 0;                                               //column of the term in the dictionary text file
            int    countIndex     = 1;                                               //column of the term frequency in the dictionary text file

            //create object
            var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary, prefixLength);

            //load dictionary
            switch (args[0].ToLowerInvariant())
            {
                case "load":
                    if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
                    {
                        Console.Error.WriteLine("File not found!");
                        return;
                    }
                    break;

                case "create":
                    if (!symSpell.CreateDictionary(dictionaryPath))
                    {
                        Console.Error.WriteLine("File not found!");
                        return;
                    }
                    break;

                default:
                    //An unrecognised command used to fall through and serve lookups
                    //from an empty dictionary; report it instead.
                    Console.Error.WriteLine("Unknown command: " + args[0] + " (expected load or create)");
                    return;
            }

            stopWatch.Stop();
            long memDelta = GC.GetTotalMemory(true) - memSize;

            //not to stdout, but to Console.Error: status info will always be on console, but not redirected or piped
            //(both divisions are floating point so the megabyte figure is not truncated to whole kilobytes first)
            Console.Error.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
                                    + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
                                    + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
                                    + (memDelta / 1024.0 / 1024.0).ToString("N0") + " MB");

            //warm up; the distance is capped so it never exceeds the dictionary's edit distance
            //(relevant when MaxEditDistance is 0)
            symSpell.Lookup("warmup", SymSpell.Verbosity.All, Math.Min(1, maxEditDistanceLookup));

            //lookup suggestions for single-word input strings
            string inputTerm;
            while (!string.IsNullOrEmpty(inputTerm = (Console.ReadLine() ?? "").Trim()))
            {
                var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceLookup, true);

                //display suggestions, edit distance and term frequency
                foreach (var suggestion in suggestions)
                {
                    Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                }
            }
        }
Exemple #19
0
        private static async Task Main(string[] args)
        {
            var directory = new DirectoryInfo("temp_git");

            if (directory.Exists)
            {
                NormalizeDirectoryAttributes(directory);
                directory.Delete(true);
            }

            void NormalizeDirectoryAttributes(DirectoryInfo directoryInfo)
            {
                foreach (var subPath in directoryInfo.GetDirectories())
                {
                    NormalizeDirectoryAttributes(subPath);
                }

                foreach (var file in directoryInfo.GetFiles())
                {
                    file.Attributes = FileAttributes.Normal;
                }
            }

            await Task.Delay(TimeSpan.FromSeconds(1));

            var info = new ProcessStartInfo("git", "clone https://github.com/discord-csharp/MODiX temp_git");
            var p    = Process.Start(info);

            if (p == null)
            {
                throw new InvalidOperationException("process handle was null");
            }
            p.WaitForExit();

            await Task.Delay(TimeSpan.FromSeconds(2));

            var extensions = new[] { ".txt", ".md", ".cs" };

            var spellingInfos = new List <FileSpellingInfo>();
            var spell         = new SymSpell();

            if (!spell.LoadDictionary(Path.Combine(Environment.CurrentDirectory, "frequency_dictionary_en_82_765.txt"), 0, 1))
            {
                throw new InvalidOperationException();
            }
            foreach (var file in Directory.GetFiles(Path.Combine(Environment.CurrentDirectory, "temp_git"), "*", SearchOption.AllDirectories))
            {
                if (!extensions.Contains(Path.GetExtension(file)))
                {
                    continue;
                }

                var spellingInfo = new FileSpellingInfo {
                    Path = file
                };
                var fileContents = File.ReadAllLines(file);
                for (var i = 0; i < fileContents.Length; i++)
                {
                    var line = fileContents[i].Trim();
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    spellingInfo.LineMistakes.Add((i + 1, line), new List <(string, string)>());
                    var words = line.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(c => c.RemoveSpecialCharacters());
                    foreach (var word in words.Where(c => !string.IsNullOrWhiteSpace(c)))
                    {
                        var results = spell.Lookup(word.ToLower(), SymSpell.Verbosity.Top);
                        if (results == null || results.Any() == false)
                        {
                            continue;
                        }
                        var suggestion = results.First();
                        if (suggestion.term == word.ToLower())
                        {
                            continue;
                        }
                        spellingInfo.LineMistakes[(i + 1, line)].Add((word, suggestion.term));
Exemple #20
0
        /// <summary>
        /// Benchmarks dictionary precalculation (load time and memory) and lookup speed of
        /// <c>SymSpell</c> against <c>Original.SymSpell</c> for every combination of
        /// maxEditDistance 1..3, prefixLength 5..7 and each entry of <c>DictionaryPath</c>,
        /// printing per-run figures and overall averages to the console.
        /// </summary>
        /// <remarks>
        /// Lookups are timed for three query kinds per verbosity: an exact term ("different"),
        /// a non-exact term ("hockie"), and the mixed word list returned by <c>BuildQuery1K()</c>.
        /// </remarks>
        static void BenchmarkPrecalculationLookup()
        {
            string[] query1k = BuildQuery1K();
            int      resultNumber = 0;
            int      repetitions = 1000;
            int      totalLoopCount = 0;
            long     totalMatches = 0;
            long     totalOrigMatches = 0;
            double   totalLoadTime, totalMem, totalLookupTime, totalOrigLoadTime, totalOrigMem, totalOrigLookupTime;

            totalLoadTime = totalMem = totalLookupTime = totalOrigLoadTime = totalOrigMem = totalOrigLookupTime = 0;
            // Counts lookups per implementation: it is incremented once after each instance/static pair.
            long totalRepetitions = 0;

            // A single stopwatch is restarted around every timed section.
            Stopwatch stopWatch = new Stopwatch();

            for (int maxEditDistance = 1; maxEditDistance <= 3; maxEditDistance++)
            {
                for (int prefixLength = 5; prefixLength <= 7; prefixLength++)
                {
                    //benchmark dictionary precalculation size and time
                    //maxEditDistance=1/2/3; prefixLength=5/6/7;  dictionary=30k/82k/500k; class=instantiated/static
                    for (int i = 0; i < DictionaryPath.Length; i++)
                    {
                        totalLoopCount++;

                        //instantiated dictionary
                        // Memory is sampled before and after the load; the difference is reported as the dictionary's size.
                        long memSize = GC.GetTotalMemory(true);
                        stopWatch.Restart();
                        SymSpell dict = new SymSpell(DictionarySize[i], maxEditDistance, prefixLength);
                        dict.LoadDictionary(DictionaryPath[i], 0, 1);
                        stopWatch.Stop();
                        long memDelta = GC.GetTotalMemory(true) - memSize;
                        totalLoadTime += stopWatch.Elapsed.TotalSeconds;
                        totalMem      += memDelta / 1024.0 / 1024.0;
                        Console.WriteLine("Precalculation instance " + stopWatch.Elapsed.TotalSeconds.ToString("N3") + "s " + (memDelta / 1024.0 / 1024.0).ToString("N1") + "MB " + dict.WordCount.ToString("N0") + " words " + dict.EntryCount.ToString("N0") + " entries  MaxEditDistance=" + maxEditDistance.ToString() + " prefixLength=" + prefixLength.ToString() + " dict=" + DictionaryName[i]);

                        //static dictionary
                        memSize = GC.GetTotalMemory(true);
                        stopWatch.Restart();
                        Original.SymSpell dictOrig = new Original.SymSpell(maxEditDistance, prefixLength);
                        dictOrig.LoadDictionary(DictionaryPath[i], "", 0, 1);
                        stopWatch.Stop();
                        memDelta           = GC.GetTotalMemory(true) - memSize;
                        totalOrigLoadTime += stopWatch.Elapsed.TotalSeconds;
                        totalOrigMem      += memDelta / 1024.0 / 1024.0;
                        // NOTE(review): the printed figure below divides by the integer 1024 first (integer division),
                        // unlike the accumulation above and the instance line, which use 1024.0 throughout.
                        Console.WriteLine("Precalculation static   " + stopWatch.Elapsed.TotalSeconds.ToString("N3") + "s " + (memDelta / 1024 / 1024.0).ToString("N1") + "MB " + dictOrig.Count.ToString("N0") + " words " + dictOrig.EntryCount.ToString("N0") + " entries  MaxEditDistance=" + maxEditDistance.ToString() + " prefixLength=" + prefixLength.ToString() + " dict=" + DictionaryName[i]);

                        //benchmark lookup result number and time
                        //maxEditDistance=1/2/3; prefixLength=5/6/7; dictionary=30k/82k/500k; verbosity=0/1/2; query=exact/non-exact/mix; class=instantiated/static
                        foreach (SymSpell.Verbosity verbosity in Enum.GetValues(typeof(SymSpell.Verbosity)))
                        {
                            //instantiated exact
                            // The same term is looked up 'repetitions' times; resultNumber keeps only the last
                            // call's count, so each timed batch adds one lookup's result count to the totals.
                            stopWatch.Restart();
                            for (int round = 0; round < repetitions; round++)
                            {
                                resultNumber = dict.Lookup("different", verbosity, maxEditDistance).Count;
                            }
                            stopWatch.Stop();
                            totalLookupTime += stopWatch.Elapsed.TotalMilliseconds;
                            totalMatches    += resultNumber;
                            Console.WriteLine("Lookup instance " + resultNumber.ToString("N0") + " results " + (stopWatch.Elapsed.TotalMilliseconds / repetitions).ToString("N6") + "ms/op verbosity=" + verbosity.ToString() + " query=exact");
                            //static exact
                            stopWatch.Restart();
                            for (int round = 0; round < repetitions; round++)
                            {
                                resultNumber = dictOrig.Lookup("different", "", maxEditDistance, (int)verbosity).Count;
                            }
                            stopWatch.Stop();
                            totalOrigLookupTime += stopWatch.Elapsed.TotalMilliseconds;
                            totalOrigMatches    += resultNumber;
                            Console.WriteLine("Lookup static   " + resultNumber.ToString("N0") + " results " + (stopWatch.Elapsed.TotalMilliseconds / repetitions).ToString("N6") + "ms/op verbosity=" + verbosity.ToString() + " query=exact");
                            Console.WriteLine();
                            totalRepetitions += repetitions;

                            //instantiated non-exact
                            stopWatch.Restart();
                            for (int round = 0; round < repetitions; round++)
                            {
                                resultNumber = dict.Lookup("hockie", verbosity, maxEditDistance).Count;
                            }
                            stopWatch.Stop();
                            totalLookupTime += stopWatch.Elapsed.TotalMilliseconds;
                            totalMatches    += resultNumber;
                            Console.WriteLine("Lookup instance " + resultNumber.ToString("N0") + " results " + (stopWatch.Elapsed.TotalMilliseconds / repetitions).ToString("N6") + "ms/op verbosity=" + verbosity.ToString() + " query=non-exact");
                            //static non-exact
                            stopWatch.Restart();
                            for (int round = 0; round < repetitions; round++)
                            {
                                resultNumber = dictOrig.Lookup("hockie", "", maxEditDistance, (int)verbosity).Count;
                            }
                            stopWatch.Stop();
                            totalOrigLookupTime += stopWatch.Elapsed.TotalMilliseconds;
                            totalOrigMatches    += resultNumber;
                            Console.WriteLine("Lookup static   " + resultNumber.ToString("N0") + " results " + (stopWatch.Elapsed.TotalMilliseconds / repetitions).ToString("N6") + "ms/op verbosity=" + verbosity.ToString() + " query=non-exact");
                            Console.WriteLine();
                            totalRepetitions += repetitions;

                            //instantiated mix
                            // Here every word of query1k is looked up once and the result counts are summed.
                            stopWatch.Restart();
                            resultNumber = 0; foreach (var word in query1k)
                            {
                                resultNumber += dict.Lookup(word, verbosity, maxEditDistance).Count;
                            }
                            stopWatch.Stop();
                            totalLookupTime += stopWatch.Elapsed.TotalMilliseconds;
                            totalMatches    += resultNumber;
                            Console.WriteLine("Lookup instance " + resultNumber.ToString("N0") + " results " + (stopWatch.Elapsed.TotalMilliseconds / query1k.Length).ToString("N6") + "ms/op verbosity=" + verbosity.ToString() + " query=mix");
                            //static mix
                            stopWatch.Restart();
                            resultNumber = 0; foreach (var word in query1k)
                            {
                                resultNumber += dictOrig.Lookup(word, "", maxEditDistance, (int)verbosity).Count;
                            }
                            stopWatch.Stop();
                            totalOrigLookupTime += stopWatch.Elapsed.TotalMilliseconds;
                            totalOrigMatches    += resultNumber;
                            Console.WriteLine("Lookup static   " + resultNumber.ToString("N0") + " results " + (stopWatch.Elapsed.TotalMilliseconds / query1k.Length).ToString("N6") + "ms/op verbosity=" + verbosity.ToString() + " query=mix");
                            Console.WriteLine();
                            totalRepetitions += query1k.Length;
                        }
                        Console.WriteLine();

                        // Drop the references before the next iteration's GC.GetTotalMemory(true) baseline is taken.
                        dict     = null;
                        dictOrig = null;
                    }
                }
            }
            // Summary: averages per dictionary build / per lookup; the percentage is the relative
            // difference of the instance figures versus the static ones (ratio minus one).
            Console.WriteLine("Average Precalculation time instance " + (totalLoadTime / totalLoopCount).ToString("N3") + "s   " + ((totalLoadTime / totalOrigLoadTime) - 1).ToString("P1"));
            Console.WriteLine("Average Precalculation time static   " + (totalOrigLoadTime / totalLoopCount).ToString("N3") + "s");
            Console.WriteLine("Average Precalculation memory instance " + (totalMem / totalLoopCount).ToString("N1") + "MB " + ((totalMem / totalOrigMem) - 1).ToString("P1"));
            Console.WriteLine("Average Precalculation memory static   " + (totalOrigMem / totalLoopCount).ToString("N1") + "MB");
            Console.WriteLine("Average Lookup time instance " + (totalLookupTime / totalRepetitions).ToString("N3") + "ms          " + ((totalLookupTime / totalOrigLookupTime) - 1).ToString("P1"));
            Console.WriteLine("Average Lookup time static   " + (totalOrigLookupTime / totalRepetitions).ToString("N3") + "ms");
            Console.WriteLine("Total Lookup results instance " + totalMatches.ToString("N0") + "      " + (totalMatches - totalOrigMatches) + " differences");
            Console.WriteLine("Total Lookup results static   " + totalOrigMatches.ToString("N0"));
        }
        // Labels written to ExperimentSpell.correction by Experimento3.
        private const string CorrectionEqual     = "igual";
        private const string CorrectionCorrected = "Corregida";
        private const string CorrectionSpacing   = "Espacios";
        private const string CorrectionError     = "Error";

        /// <summary>
        /// Experiment 3: concatenates the OCR text of every JSON response file in <c>D:\json\</c>,
        /// runs it through <c>SymSpell.LookupCompound</c>, aligns the corrected tokens with the
        /// original OCR tokens by similarity, labels each pair, and hands the resulting table to
        /// <c>CreateExcelFileExperimento</c>.
        /// </summary>
        /// <remarks>
        /// Each corrected token is compared with the current OCR token first, then with the previous
        /// one, then with the next one. The first comparison whose similarity passes a threshold decides
        /// the label and moves the OCR cursor. If none passes, the pair is labelled "Error".
        /// </remarks>
        private static void Experimento3()
        {
            string strPath = @"D:\json\";

            string[]           fileEntries     = Directory.GetFiles(strPath);
            StringBuilder      OCROriginal     = new StringBuilder();
            EditDistanceLength editDistance    = new EditDistanceLength();
            const int          initialCapacity = 82765;
            const int          maxEditDistance = 5;
            const int          prefixLength    = 7;
            SymSpell           symSpell        = new SymSpell(initialCapacity, maxEditDistance, prefixLength);
            Dictionary <int, ExperimentSpell> excelMatrix = new Dictionary <int, ExperimentSpell>();

            // Collect the full OCR text of every response in every file.
            foreach (string path in fileEntries)
            {
                string jsonText = File.ReadAllText(path, Encoding.Default);
                var    response = Google.Protobuf.JsonParser.Default.Parse <Google.Cloud.Vision.V1.AnnotateFileResponse>(jsonText);
                foreach (var respuestas in response.Responses)
                {
                    var annotation = respuestas.FullTextAnnotation;
                    if (annotation != null)
                    {
                        OCROriginal.Append(annotation.Text);
                    }
                }
            }

            symSpell.LoadDictionary(@"D:\load8.txt", 0, 1);
            List <SymSpell.SuggestItem> suggestions = symSpell.LookupCompound(OCROriginal.ToString(), 2);

            // Tokenise both texts on spaces after stripping line breaks, braces and (for the OCR text) punctuation.
            var arraySymspell    = suggestions[0].ToString().Replace("\n", " ").Replace("{", "").Replace("}", "").Split(' ');
            var arrayOCROriginal = OCROriginal.ToString().Replace("\n", " ").Replace("{", "").Replace("}", "").Replace(": ", "***").Replace(" : ", " ").Replace(":", " ").Replace("***", ": ").Replace(". ", " ").Replace(", ", " ").Replace("-", " ").Split(' ');

            int j = 0; // cursor into arrayOCROriginal
            int k = 0; // next row key in excelMatrix

            for (int i = 0; i < arraySymspell.Length; i++)
            {
                if (j == arrayOCROriginal.Length)
                {
                    break;
                }

                string corrected  = arraySymspell[i];
                double similarity = editDistance.CalculateSimilarity(corrected, arrayOCROriginal[j].ToLower());
                ExperimentSpell exp1 = new ExperimentSpell();

                string label = ClassifyExperimentSimilarity(similarity);
                string original;

                if (label != null)
                {
                    // Match against the current OCR token. A spacing match leaves the cursor
                    // in place so the same OCR token is compared with the next corrected token.
                    original = arrayOCROriginal[j];
                    if (label != CorrectionSpacing)
                    {
                        j++;
                    }
                }
                else
                {
                    // Retry against the previous OCR token, when there is one.
                    similarity = j > 0
                        ? editDistance.CalculateSimilarity(corrected, arrayOCROriginal[j - 1].ToLower())
                        : 0;
                    label = ClassifyExperimentSimilarity(similarity);

                    if (label != null)
                    {
                        j--;
                        original = arrayOCROriginal[j];
                    }
                    else
                    {
                        // Retry against the next OCR token, when there is one.
                        similarity = j + 1 < arrayOCROriginal.Length
                            ? editDistance.CalculateSimilarity(corrected, arrayOCROriginal[j + 1].ToLower())
                            : 0;
                        label = ClassifyExperimentSimilarity(similarity);

                        if (label != null)
                        {
                            j++;
                            original = arrayOCROriginal[j];
                        }
                        else
                        {
                            // Nothing nearby matches: record the current token as an error and move on.
                            label    = CorrectionError;
                            original = arrayOCROriginal[j];
                            j++;
                        }
                    }
                }

                exp1.correction = label;
                exp1.correctionLookupCompound = corrected;
                exp1.original = original;
                excelMatrix.Add(k++, exp1);
            }
            CreateExcelFileExperimento(excelMatrix, "3");
        }

        /// <summary>
        /// Maps a similarity score to its correction label: exactly 1 is "igual", at least 0.4 is
        /// "Corregida", above 0.06 is "Espacios". Returns <c>null</c> when no threshold is met.
        /// </summary>
        private static string ClassifyExperimentSimilarity(double similarity)
        {
            if (similarity == 1)
            {
                return CorrectionEqual;
            }
            if (similarity >= .4)
            {
                return CorrectionCorrected;
            }
            if (similarity > 0.06)
            {
                return CorrectionSpacing;
            }
            return null;
        }
Exemple #22
0
 /// <summary>
 /// Creates the SymSpell engine used by this corrector and loads its frequency dictionary.
 /// </summary>
 /// <param name="path">Path of the dictionary file handed to <c>LoadDictionary</c>.</param>
 /// <param name="MaxEditDistance">
 /// Maximum edit distance passed to the SymSpell constructor; the prefix length is
 /// set to this value plus one.
 /// </param>
 public WordCorrection(string path, int MaxEditDistance)
 {
     const int initialCapacity = 82765;
     const int termColumn      = 0;
     const int countColumn     = 1;

     int prefixLength = MaxEditDistance + 1;

     symSpell = new SymSpell(initialCapacity, MaxEditDistance, prefixLength);

     // NOTE(review): the boolean returned by LoadDictionary is discarded, so a failed
     // load goes unnoticed here — confirm that this is intended.
     symSpell.LoadDictionary(path, termColumn, countColumn);
 }
Exemple #23
0
        /// <summary>
        /// Click handler: loads two frequency dictionaries into SymSpell, asks the user for a
        /// text file, replaces every space-separated token with SymSpell's closest suggestion
        /// and writes the result to a randomly named .txt file in the same directory as the
        /// chosen file.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void SpellCorrect_Click(object sender, RoutedEventArgs e)
        {
            ConsoleManager.Show();
            const int initialCapacity = 82765 * 2;
            const int maxEditDistance = 5;
            const int prefixLength    = 7;
            SymSpell  symSpell        = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

            // Load a frequency dictionary
            //wordfrequency_en.txt  ensures high correction quality by combining two data sources:
            //Google Books Ngram data  provides representative word frequencies (but contains many entries with spelling errors)
            //SCOWL — Spell Checker Oriented Word Lists which ensures genuine English vocabulary (but contained no word frequencies)
            // NOTE(review): both paths are absolute and machine-specific.
            string path  = @"C:\Users\Emmanuel\source\repos\Project-Carl\Project-CARL\WpfApp1\frequency_dictionary_en_82_765.txt";
            string dict2 = @"C:\Users\Emmanuel\source\repos\Project-Carl\Project-CARL\WpfApp1\unigram_freq.txt";

            // A failed load is reported but is not fatal: processing continues with whatever
            // was loaded. Each message names the dictionary that actually failed.
            foreach (string dictionaryPath in new[] { path, dict2 })
            {
                if (!symSpell.LoadDictionary(dictionaryPath, 0, 1))
                {
                    Console.Error.WriteLine("\rFile not found: " + System.IO.Path.GetFullPath(dictionaryPath));
                    Console.ReadKey();
                }
            }

            //Open textfile
            System.Windows.Forms.MessageBox.Show("Choose file to Correct");

            string correctionFile;
            using (OpenFileDialog openFileDialog1 = new OpenFileDialog())
            {
                if (openFileDialog1.ShowDialog() != System.Windows.Forms.DialogResult.OK)
                {
                    // The user cancelled: there is no file to read, so stop here instead of
                    // passing an empty path to File.ReadAllText.
                    return;
                }

                correctionFile = openFileDialog1.FileName;
            }

            System.Windows.Forms.MessageBox.Show(correctionFile);

            //read raw text file and tokenize it on single spaces
            string   corp  = File.ReadAllText(correctionFile, Encoding.UTF8);
            string[] words = corp.Split(new string[] { " " }, StringSplitOptions.None);

            //submit each word to symSpell; every output token is followed by one space
            StringBuilder correctedText = new StringBuilder(corp.Length);
            foreach (string word in words)
            {
                List <SymSpell.SuggestItem> suggestedWord = symSpell.Lookup(word, SymSpell.Verbosity.Closest);

                // Keep the original token when SymSpell has no suggestion for it.
                string corrected = suggestedWord.Count > 0 ? suggestedWord[0].term : word;
                correctedText.Append(corrected).Append(' ');
            }

            //save words to a randomly named file in the directory of the selected file
            string fileName   = System.IO.Path.GetRandomFileName() + ".txt";
            string pathString = System.IO.Path.Combine(System.IO.Path.GetDirectoryName(correctionFile), fileName);

            File.WriteAllText(pathString, correctedText.ToString());
        }
        /// <summary>
        /// Builds the HTML form of each chat line. When one of the name tags is found within
        /// the first three characters of a line, the line is passed through CheckDerps
        /// (and FixBadSpelling for a fixed set of speakers), the name is wrapped in a bold
        /// span and AddCharacterColors is applied. Every line finally goes through
        /// ReservedCharacterChangePass and StylisticCharacterChangePass.
        /// </summary>
        /// <param name="chatLines">Chat lines to convert.</param>
        /// <param name="finalNameTags">Name tags searched for at the start of each line.</param>
        /// <returns>One converted line per input line, in input order.</returns>
        private List <string> AddHTMLTags(List <string> chatLines, List <string> finalNameTags)
        {
            const int    dictionaryCapacity     = 82765;
            const int    dictionaryEditDistance = 2; //maximum edit distance per dictionary precalculation
            const int    termColumn             = 0; //column of the term in the dictionary text file
            const int    countColumn            = 1; //column of the term frequency in the dictionary text file
            const string boldTag                = "<span style=\"font-weight: bold; color:#000000; \">";
            const string closingTag             = "</span>";

            var    symSpellEngine = new SymSpell(dictionaryCapacity, dictionaryEditDistance);
            string dictionaryPath = AppDomain.CurrentDomain.BaseDirectory + "../../frequency_dictionary_en_82_765.txt";

            bool dictionaryLoaded = symSpellEngine.LoadDictionary(dictionaryPath, termColumn, countColumn);
            if (!dictionaryLoaded)
            {
                // Reported only; processing continues with the engine as it is.
                Console.WriteLine("File not found!");
            }

            List <string> convertedLines = new List <string>();

            foreach (string chatLine in chatLines)
            {
                string taggedLine = string.Empty;

                foreach (string name in finalNameTags)
                {
                    int  nameIndex     = chatLine.IndexOf(name);
                    bool nameLeadsLine = nameIndex >= 0 && nameIndex <= 2;

                    if (!nameLeadsLine)
                    {
                        // NOTE(review): writes the line built so far (possibly empty) to the
                        // console for every non-matching name — looks like debug output.
                        Console.Write(taggedLine);
                        continue;
                    }

                    string derpChecked = CheckDerps(chatLine, name);

                    // Only lines starting with one of these speaker names are spell-corrected.
                    bool needsSpellFix = derpChecked.StartsWith("Lady Red")
                                         || derpChecked.StartsWith("LadyRedE")
                                         || derpChecked.StartsWith("Carissa T")
                                         || derpChecked.StartsWith("PrincessV");

                    string lineBody = needsSpellFix
                        ? FixBadSpelling(derpChecked, symSpellEngine)
                        : derpChecked;

                    // Wrap the name: opening tag at the name's position, closing tag right after it.
                    int closingTagIndex = nameIndex + boldTag.Length + name.Length;
                    taggedLine = lineBody.Insert(nameIndex, boldTag);
                    taggedLine = taggedLine.Insert(closingTagIndex, closingTag);
                    taggedLine = AddCharacterColors(taggedLine, name, nameIndex, boldTag);
                }

                // No name produced a tagged line: fall back to the untouched input line.
                if (taggedLine.Length < 3)
                {
                    taggedLine = chatLine;
                }

                taggedLine = ReservedCharacterChangePass(taggedLine);
                taggedLine = StylisticCharacterChangePass(taggedLine);
                convertedLines.Add(taggedLine);
            }

            return convertedLines;
        }
Exemple #25
0
        // Test harness: loads several frequency dictionaries one after another into the same
        // SymSpell instance and runs a sample correction after each load.
        public static void Main(string[] args)
        {
            //set parameters
            const int initialCapacity = 82765;
            const int maxEditDistance = 2;
            const int prefixLength    = 7;
            const int termColumn      = 0;
            const int countColumn     = 1;
            SymSpell  symSpell        = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

            // "Test 1" is disabled; the original author did not manage to get it running:
            // path = AppDomain.CurrentDomain.BaseDirectory + "../../../../../lists/bonjour.txt";
            // if (!SymSpellCompound.CreateDictionary(path, 0, 1)) Console.Error.WriteLine("File not found: " + Path.GetFullPath(path));
            // input = "bonjor";
            // Correct(input, symSpell);
            // Console.WriteLine();

            // Each entry: { label, dictionary file under lists/, text to correct }.
            string[][] testCases =
            {
                new[] { "Test 2", "bonjour.1.txt", "bonjuor" },
                new[] { "Test 3", "bonjour.2.txt", "bonjur hallo" },
                // breaks with "བཀྲ་ཤས་་", breaks with "བཀྲ་ཤིན་", doesn't recognize "བཀྲ་ཤེས་", or "སཀྲ་ཤིས་"
                new[] { "Test 4", "tib.txt", "སཀྲ་ཤིས་" },
                new[] { "Test 5", "tib.1.txt", "དཀྲ'ཤེས'" },
            };

            for (int caseIndex = 0; caseIndex < testCases.Length; caseIndex++)
            {
                string label          = testCases[caseIndex][0];
                string dictionaryFile = testCases[caseIndex][1];
                string input          = testCases[caseIndex][2];

                Console.WriteLine(label);

                string path = AppDomain.CurrentDomain.BaseDirectory + "../../../../../lists/" + dictionaryFile;
                if (!symSpell.LoadDictionary(path, termColumn, countColumn))
                {
                    Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path));
                    Console.ReadKey();
                    return;
                }

                Correct(input, symSpell);

                // A blank line separates the tests; none is written after the last one.
                bool isLastCase = caseIndex == testCases.Length - 1;
                if (!isLastCase)
                {
                    Console.WriteLine();
                }
            }
        }
        /// <summary>
        /// Command-line entry point. With more than two arguments it builds a SymSpell
        /// dictionary (loaded from a dictionary file or created from a corpus), prints
        /// statistics to stderr and then corrects every line read from stdin until an empty
        /// line or end of input. With two arguments or fewer it prints the usage text.
        /// </summary>
        /// <param name="args">
        /// DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity]
        /// </param>
        static void Main(string[] args)
        {
            if (args.Length <= 2)
            {
                PrintUsage();
                return;
            }

            Console.Error.Write("Creating dictionary ...");
            long      memSize   = GC.GetTotalMemory(true);
            Stopwatch stopWatch = new Stopwatch();
            stopWatch.Start();

            //parameters
            int initialCapacity = 82765;
            int termIndex       = 0; //column of the term in the dictionary text file
            int countIndex      = 1; //column of the term frequency in the dictionary text file

            //dictionaryType: must be exactly one of the supported keywords. A substring test
            //would also let through fragments such as "oad" or ".", which no branch handles.
            string dictionaryType = args[0].ToLowerInvariant();
            if (dictionaryType != "load" && dictionaryType != "create")
            {
                Console.Error.WriteLine("Error in parameter 1"); return;
            }

            //dictionaryPath
            string dictionaryPath = AppDomain.CurrentDomain.BaseDirectory + args[1];

            //prefix length (optional parameter): when args[2] is a number it is the prefix
            //length and all following parameters shift by one position
            int    offset     = 0;
            string lookupType = "";
            int    prefixLength;
            if (int.TryParse(args[2], out prefixLength))
            {
                offset = 1;
            }
            else
            {
                prefixLength = 7;
            }

            //lookupType: exact keyword match, for the same reason as dictionaryType
            if (args.Length > 2 + offset)
            {
                lookupType = args[2 + offset].ToLowerInvariant();
                if (lookupType != "lookup" && lookupType != "lookupcompound" && lookupType != "wordsegment")
                {
                    Console.Error.WriteLine("Error in parameter " + (3 + offset).ToString()); return;
                }
            }

            //maxEditDistance
            int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
            if (args.Length > 3 + offset)
            {
                if (!int.TryParse(args[3 + offset], out maxEditDistanceDictionary))
                {
                    Console.Error.WriteLine("Error in parameter " + (4 + offset).ToString()); return;
                }
            }

            //output stats
            bool outputStats = false;//false, true
            if (args.Length > 4 + offset)
            {
                if (!bool.TryParse(args[4 + offset], out outputStats))
                {
                    Console.Error.WriteLine("Error in parameter " + (5 + offset).ToString()); return;
                }
            }

            //verbosity
            var suggestionVerbosity = SymSpell.Verbosity.Top; //Top, Closest, All
            if (args.Length > 5 + offset)
            {
                //Enum.TryParse also succeeds for numeric strings that map to no member,
                //so additionally require a defined value
                if (!Enum.TryParse(args[5 + offset], true, out suggestionVerbosity)
                    || !Enum.IsDefined(typeof(SymSpell.Verbosity), suggestionVerbosity))
                {
                    Console.Error.WriteLine("Error in parameter " + (6 + offset).ToString()); return;
                }
            }

            //create object
            var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary, prefixLength);

            //load dictionary (dictionaryType was validated above, so it is "load" or "create")
            bool dictionaryReady = dictionaryType == "load"
                ? symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex)
                : symSpell.CreateDictionary(dictionaryPath);
            if (!dictionaryReady)
            {
                Console.Error.WriteLine("File not found!");
                return;
            }

            stopWatch.Stop();
            long memDelta = GC.GetTotalMemory(true) - memSize;

            //not to stdout, but to Console.Error: status info will alway be on console, but not redirected or piped
            Console.Error.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
                                    + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
                                    + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
                                    + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

            //warm up (the result is intentionally discarded)
            symSpell.Lookup("warmup", SymSpell.Verbosity.All);

            //correct input strings line by line; an empty line or end of input stops the loop.
            //When no LookupType was given, lines are read but nothing is printed.
            string inputTerm;
            while (!string.IsNullOrEmpty(inputTerm = (Console.ReadLine() ?? "").Trim()))
            {
                switch (lookupType)
                {
                case "lookup":
                    var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceDictionary, true);
                    //display suggestions, edit distance and term frequency
                    foreach (var suggestion in suggestions)
                    {
                        if (outputStats)
                        {
                            Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                        }
                        else
                        {
                            Console.WriteLine(suggestion.term);
                        }
                    }
                    break;

                case "lookupcompound":
                    var suggestions2 = symSpell.LookupCompound(inputTerm);
                    //display suggestions, edit distance and term frequency
                    foreach (var suggestion in suggestions2)
                    {
                        if (outputStats)
                        {
                            Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                        }
                        else
                        {
                            Console.WriteLine(suggestion.term);
                        }
                    }
                    break;

                case "wordsegment":
                    var suggestions3 = symSpell.WordSegmentation(inputTerm);
                    //display corrected string, summed edit distance and log probability
                    foreach (var suggestion in suggestions3)
                    {
                        if (outputStats)
                        {
                            Console.WriteLine(suggestion.correctedString + " " + suggestion.distanceSum.ToString("N0") + " " + suggestion.probabilityLogSum.ToString());
                        }
                        else
                        {
                            Console.WriteLine(suggestion.correctedString);
                        }
                    }
                    break;

                default:
                    break;
                }
            }
        }

        // Prints the command-line usage text to stdout.
        private static void PrintUsage()
        {
            //PrefixLength is number

            //help
            Console.WriteLine("SymSpell.CommandLine DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity]");
            Console.WriteLine();
            Console.WriteLine("DictionaryType=load|create");
            Console.WriteLine("   load: load dictionary from dictionary file");
            Console.WriteLine("   create: create dictionary from text corpus");
            Console.WriteLine("DictionaryPath: path to dictionary/corpus file");
            Console.WriteLine("PrefixLength: default=7 (speed/memory consumption trade-off)");  //dictionary param
            Console.WriteLine("   5: low memory, slow lookup");
            Console.WriteLine("   6: medium memory, medium lookup");
            Console.WriteLine("   7: high memory, fast lookup");
            //lookup intended for correction of single word
            //lookupcompound intended for correction of multiple words, it can insert only a single space per token, faster than wordsegmentation
            //wordsegmentation intended for segmentation and correction of multiple words, it can insert multiple spaces per token, slower than lookupcompound
            Console.WriteLine("LookupType=lookup|lookupcompound|wordsegment");
            Console.WriteLine("   lookup: correct single word");
            Console.WriteLine("   lookupcompound: correct multiple-word string (supports splitting/merging)");
            Console.WriteLine("   wordsegment: word segment and correct input string");
            Console.WriteLine("MaxEditDistance: default=2 (0: no correction, word segmentation only)");
            Console.WriteLine("OutputStats=false|true");
            Console.WriteLine("   false: only corrected string");
            Console.WriteLine("   true: corrected string, edit distance, word frequency/probability");
            Console.WriteLine("Verbosity=top|closest|all"); //no effect for lookupcompound and wordsegment
            Console.WriteLine("   top: Top suggestion");
            Console.WriteLine("   closest: All suggestions of smallest edit distance found");
            Console.WriteLine("   all: All suggestions within maxEditDistance");
            Console.WriteLine();
        }
        /// <summary>
        /// Experiment 2.1: concatenates the full-text OCR annotation of every Google Vision
        /// JSON response found under D:\json\, runs SymSpell LookupCompound over the whole
        /// text and exports the corrected and original tokens, paired by position, through
        /// CreateExcelFileExperimento (experiment id "2").
        /// </summary>
        private static void Experimento2_1()
        {
            Stopwatch stopWatch = new Stopwatch();
            string    strPath   = @"D:\json\";

            string[]      fileEntries = Directory.GetFiles(strPath);
            StringBuilder OCROriginal = new StringBuilder();
            // NOTE(review): not used in this method; kept because the constructor's side
            // effects (if any) are not visible from here.
            EditDistanceLength editDistance = new EditDistanceLength();
            //Symspell parameters
            const int initialCapacity = 82765;
            const int maxEditDistance = 5;
            const int prefixLength    = 7;
            SymSpell  symSpell        = new SymSpell(initialCapacity, maxEditDistance, prefixLength);
            Dictionary <int, ExperimentSpell> excelMatrix = new Dictionary <int, ExperimentSpell>();

            //collect the OCR text of every response in every file
            foreach (string path in fileEntries)
            {
                string jsonText = File.ReadAllText(path, Encoding.Default);
                var    response = Google.Protobuf.JsonParser.Default.Parse <Google.Cloud.Vision.V1.AnnotateFileResponse>(jsonText);
                foreach (var respuestas in response.Responses)
                {
                    var annotation = respuestas.FullTextAnnotation;
                    if (annotation != null)
                    {
                        OCROriginal.Append(annotation.Text);
                    }
                }
            }

            //the stopwatch brackets dictionary load + lookup; its value is not reported here
            stopWatch.Start();
            //load symspell dictionary default
            const string dictionaryPath = @"D:\load8.txt";
            if (!symSpell.LoadDictionary(dictionaryPath, 0, 1))
            {
                //without a dictionary the comparison below would be meaningless
                Console.Error.WriteLine("File not found: " + dictionaryPath);
                return;
            }
            //process symspell
            List <SymSpell.SuggestItem> suggestions = symSpell.LookupCompound(OCROriginal.ToString(), 2);

            stopWatch.Stop();

            var arraySymspell = suggestions[0].ToString().Replace("\n", " ").Replace("}", "").Split(' ');
            var arrayOCROriginal = OCROriginal.ToString().Replace("\n", " ").Replace("}", "").Replace(": ", "***").Replace(" : ", " ").Replace(":", " ").Replace("***", ": ").Replace(". ", " ").Replace(", ", " ").Replace("-", " ").Split(' ');

            //pair tokens by position; corrected tokens beyond the end of the original get ""
            for (int i = 0; i < arraySymspell.Length; i++)
            {
                ExperimentSpell exp1 = new ExperimentSpell();
                exp1.correction = "igual";
                exp1.correctionLookupCompound = arraySymspell[i];
                exp1.original = i < arrayOCROriginal.Length ? arrayOCROriginal[i] : "";
                excelMatrix.Add(i, exp1);
            }
            CreateExcelFileExperimento(excelMatrix, "2");
        }