Beispiel #1
0
        private void timerGoogle_Tick(object sender, EventArgs e)
        {
            timerGoogle.Enabled = false;

            Correction correction = new Correction();
            // find Suggestion using Google:

            int Key = errors.ElementAtOrDefault(indexTimerGoogle).Key;

            if (Key == 0)
            {
                ResetControls(true);
                return;
            }

            string logGoogle; int totalHits;
            string googleSuggestion = correction.GetGoogleSuggestionFromTrigram(deHyphenateTokens, Key, out totalHits, out logGoogle);

            // jika suggestion sama dengan err & totalhits kosong (artinya pencarian tidak ada hasil) coba search bigram:
            if (googleSuggestion == deHyphenateTokens[Key] && totalHits == 0)
            {
                string logGoogleBigram;
                googleSuggestion = correction.GetGoogleSuggestionFromBigram(deHyphenateTokens, Key, out totalHits, out logGoogleBigram);
                logGoogle       += logGoogleBigram;
            }
            // jika suggestion berbeda dg err maka Change suggestion back to Old Spell:
            if (googleSuggestion != deHyphenateTokens[Key])
            {
                googleSuggestion = correction.ChangeNewToOldSpell(googleSuggestion);
                // update token dic with suggestion:
                deHyphenateTokens[Key] = "[" + googleSuggestion + "]";
            }
            else
            {
                googleSuggestion = deHyphenateTokens[Key];
                logGoogle        = deHyphenateTokens[Key] + " is correct";
            }

            // for analysis:
            if (!string.IsNullOrEmpty(articleFileGT))
            {
                correction.UpdateFields(articleFileName, Key, new Dictionary <string, string> {
                    { getFieldNameFromOption(), googleSuggestion }, { getFieldNameFromOption().Replace("Correction", "Log"), logGoogle }
                });
            }

            indexTimerGoogle++;
            Random r        = new Random();
            int    interval = r.Next(7000, 13000);

            timerGoogle.Interval = interval;
            timerGoogle.Enabled  = true;
        }
Beispiel #2
0
        private void btCorrectIt_Click(object sender, EventArgs e)
        {
            if (cbMethod.SelectedItem.ToString() == "-- Choose method --")
            {
                MessageBox.Show("Choose method first", "", MessageBoxButtons.OK);
                return;
            }
            if (!File.Exists(txOCR.Text.Trim()))
            {
                MessageBox.Show("Browse file first", "", MessageBoxButtons.OK);
                return;
            }

            ResetControls(false);

            articleFile = txOCR.Text.Trim();
            Correction correction = new Correction();
            Stemmer    stemmer    = new Stemmer();

            // DeHyphenate and clean text:
            string dehyphenatedText = correction.DeHyphenate(articleFile);

            rtbOCR.Text = dehyphenatedText;

            // for analysis:
            string dehyphenatedTextGT = "";

            if (File.Exists(articleFile.Substring(0, articleFile.Length - 4) + "GT.txt"))
            {
                articleFileGT = articleFile.Substring(0, articleFile.Length - 4) + "GT.txt";
            }
            articleFileName = Path.GetFileName(articleFile);
            if (!string.IsNullOrEmpty(articleFileGT))
            {
                dehyphenatedTextGT = correction.DeHyphenate(articleFileGT);
            }

            // tokenize:
            deHyphenateTokens = correction.GetTokensFromText(dehyphenatedText);

            Regex rgx = new Regex("[^a-zA-Z]"); //omit all non alphabet word And clean word from non alphabet:

            // for analysis:
            Dictionary <int, string> deHyphenateTokensGT = new Dictionary <int, string>();

            if (!string.IsNullOrEmpty(articleFileGT))
            {
                deHyphenateTokensGT = correction.GetTokensFromText(dehyphenatedTextGT);
                foreach (KeyValuePair <int, string> token in deHyphenateTokens)
                {
                    correction.InsertOCRAndTruth(articleFileName, token.Key, rgx.Replace(token.Value, ""), rgx.Replace(deHyphenateTokensGT[token.Key], ""));
                }
            }

            // Omit non character,single char, All Capitals word, and clean word from non alphabet:
            var tmp = deHyphenateTokens.Where(p => p.Value.Length > 1).ToDictionary(p => p.Key, p => p.Value);

            tmp = tmp.Where(p => p.Value.Any(Char.IsLetter)).ToDictionary(p => p.Key, p => rgx.Replace(p.Value, ""));
            Dictionary <int, string> cleanTokens = tmp.Where(p => !p.Value.All(Char.IsUpper)).ToDictionary(p => p.Key, p => p.Value);



            // Find Suggestion:
            if (cbMethod.SelectedItem.ToString().EndsWith("Hunspell"))
            {
                string hunspellLog = "";
                // find Suggestion using Hunspell:
                foreach (KeyValuePair <int, string> err in cleanTokens)
                {
                    string        errInNewSpell       = correction.ChangeOldToNewSpell(err.Value).ToLowerInvariant();
                    List <string> hunspellSuggestions = new List <string>();
                    using (SpellEngine engine = new SpellEngine())
                    {
                        LanguageConfig idConfig = new LanguageConfig();
                        idConfig.LanguageCode     = "id";
                        idConfig.HunspellAffFile  = "id_ID.aff";
                        idConfig.HunspellDictFile = "id_ID.dic";
                        idConfig.HunspellKey      = "";
                        engine.AddLanguage(idConfig);
                        bool correct = engine["id"].Spell(errInNewSpell);
                        if (!correct)
                        {
                            hunspellSuggestions = engine["id"].Suggest(errInNewSpell);
                            if (hunspellSuggestions.Count > 0 && err.Value != correction.ChangeNewToOldSpell(hunspellSuggestions[0]))
                            {
                                deHyphenateTokens[err.Key] = "[" + correction.ChangeNewToOldSpell(hunspellSuggestions[0]) + "]";
                            }
                            // for analysis:
                            if (!string.IsNullOrEmpty(articleFileGT))
                            {
                                correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), rgx.Replace(deHyphenateTokens[err.Key], "") }, { getFieldNameFromOption().Replace("Correction", "Log"), hunspellLog }
                                });
                            }
                        }
                        else
                        {
                            // for analysis:
                            if (!string.IsNullOrEmpty(articleFileGT))
                            {
                                correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), err.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), err.Value + " is correct" }
                                });
                            }
                        }
                    }
                }
                ResetControls(true);
                return;
            }


            //check only unique word (assumption:duplicate word is correct word) :
            Dictionary <int, string> checkTokens = cleanTokens;
            var duplicateValues = checkTokens.GroupBy(x => x.Value).Where(x => x.Count() > 1);

            List <int> duplicateKeys = new List <int>();

            foreach (var item in checkTokens)
            {
                foreach (var dup in duplicateValues)
                {
                    if (item.Value == dup.Key)
                    {
                        duplicateKeys.Add(item.Key);
                    }
                }
            }
            foreach (var dupkey in duplicateKeys)
            {
                // for analysis
                if (!string.IsNullOrEmpty(articleFileGT))
                {
                    correction.UpdateFields(articleFileName, dupkey, new Dictionary <string, string> {
                        { "NCorrection", checkTokens[dupkey] }, { "NLog", "Duplicate" }, { "Correction", checkTokens[dupkey] }, { "Log", "Duplicate" }, { "WOSearchCorrection", checkTokens[dupkey] }, { "WOSearchLog", "Duplicate" }, { "WOStemCorrection", checkTokens[dupkey] }, { "WOStemLog", "Duplicate" }, { "WOStemSearchCorrection", checkTokens[dupkey] }, { "WOStemSearchLog", "Duplicate" }, { "GooglePureCorrection", checkTokens[dupkey] }, { "GooglePureLog", "Duplicate" }
                    });
                }
                checkTokens.Remove(dupkey);
            }


            //Check Word using Dictionary(kbbi+kompas pilihan, entitas kota,negara, nama pahlawan dari wiki ):
            errors = new Dictionary <int, string>();
            foreach (KeyValuePair <int, string> token in checkTokens)
            {
                // change Soewandi to Modern Spelling:
                string wordToCheck = correction.ChangeOldToNewSpell(token.Value).ToLowerInvariant();

                // check word in Dictionary and Add to Error list if not there:
                int frequency;
                if (!correction.CheckUnigram(wordToCheck, getSQLQueryToCheckUnigram(), out frequency))
                {
                    if (cbMethod.SelectedItem.ToString().Contains("Stemmer"))
                    {
                        // check again its stem in dictionary :
                        string stem = stemmer.Stemming(wordToCheck);
                        if (wordToCheck != stem && stemmer.checkStem(stem))
                        {
                            // for analysis
                            if (!string.IsNullOrEmpty(articleFileGT))
                            {
                                correction.UpdateFields(articleFileName, token.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), stem + " is word" }
                                });
                            }
                        }
                        else // jika tidak ada di kamus:
                        {
                            errors.Add(token.Key, wordToCheck);
                        }
                    }
                    else
                    {
                        errors.Add(token.Key, wordToCheck);
                    }
                }
                else // jika ada di kamus:
                {
                    // for analysis
                    if (!string.IsNullOrEmpty(articleFileGT))
                    {
                        correction.UpdateFields(articleFileName, token.Key, new Dictionary <string, string> {
                            { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), wordToCheck + " is correct" }
                        });
                    }
                }
            }


            // Find Suggestion:
            if (cbMethod.SelectedItem.ToString().EndsWith("Google"))
            {
                timerGoogle.Enabled = true;
                indexTimerGoogle    = 0;
                return;
            }
            else
            {
                foreach (KeyValuePair <int, string> err in errors)
                {
                    //get suggestion:
                    string log; string suggestion;
                    suggestion = correction.GetSuggestion(getSPNameForGetCandidates(), deHyphenateTokens, err.Key, err.Value, out log, getWithStemAndAffixCorrParamFromOption(), getWithSearchParamFromOption());

                    // Change suggestion back to Old Spell if any suggestions:
                    if (log != "No candidates")
                    {
                        suggestion = correction.ChangeNewToOldSpell(suggestion);
                    }

                    // update token dic with suggestion:
                    if (!suggestion.Equals(deHyphenateTokens[err.Key], StringComparison.OrdinalIgnoreCase))
                    {
                        deHyphenateTokens[err.Key] = "[" + suggestion + "]";
                    }

                    // for analysis:
                    if (!string.IsNullOrEmpty(articleFileGT))
                    {
                        correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                            { getFieldNameFromOption(), suggestion }, { getFieldNameFromOption().Replace("Correction", "Log"), log }
                        });
                    }
                }
                ResetControls(true);
            }
        }