예제 #1
        // Tick handler for timerGoogle. Handles one entry of `errors` (selected by indexTimerGoogle):
        // asks Correction for a Google-based suggestion, writes a differing suggestion back into
        // deHyphenateTokens, optionally records the result for analysis, and finally re-arms the
        // timer with a random interval.
        // NOTE(review): this excerpt has no braces and seems to have lost some lines (for example the
        // body of `if (Key == 0)`, an `else` before the "is correct" lines, and any increment of
        // indexTimerGoogle) -- confirm against the complete file before relying on the flow shown here.
        private void timerGoogle_Tick(object sender, EventArgs e)
            // pause the timer while this tick is processed; it is re-enabled on the last line
            timerGoogle.Enabled = false;

            Correction correction = new Correction();
            // find Suggestion using Google:

            // ElementAtOrDefault yields a default element when the index is out of range, so Key
            // would be 0 in that case (assuming errors holds int-keyed pairs -- TODO confirm)
            int Key = errors.ElementAtOrDefault(indexTimerGoogle).Key;

            // NOTE(review): the body of this check is not visible here; presumably it stops processing
            if (Key == 0)

            string logGoogle; int totalHits;
            // first attempt: trigram-based lookup around the token at Key
            string googleSuggestion = correction.GetGoogleSuggestionFromTrigram(deHyphenateTokens, Key, out totalHits, out logGoogle);

            // if the suggestion equals the error token and totalHits is zero (i.e. the search returned
            // no results), try the bigram search and append its log to the trigram log:
            if (googleSuggestion == deHyphenateTokens[Key] && totalHits == 0)
                string logGoogleBigram;
                googleSuggestion = correction.GetGoogleSuggestionFromBigram(deHyphenateTokens, Key, out totalHits, out logGoogleBigram);
                logGoogle       += logGoogleBigram;
            // if the suggestion differs from the error token, convert the suggestion back to the old spelling:
            if (googleSuggestion != deHyphenateTokens[Key])
                googleSuggestion = correction.ChangeNewToOldSpell(googleSuggestion);
                // update the token dictionary with the suggestion, wrapped in square brackets:
                deHyphenateTokens[Key] = "[" + googleSuggestion + "]";
                // NOTE(review): the next two lines log the token as correct, so they presumably belong
                // to an else branch whose keyword is not visible in this excerpt -- confirm
                googleSuggestion = deHyphenateTokens[Key];
                logGoogle        = deHyphenateTokens[Key] + " is correct";

            // for analysis: only when a ground-truth file is set, store the suggestion and its log
            // under the field names derived from the selected option ("...Correction" / "...Log")
            if (!string.IsNullOrEmpty(articleFileGT))
                correction.UpdateFields(articleFileName, Key, new Dictionary <string, string> {
                    { getFieldNameFromOption(), googleSuggestion }, { getFieldNameFromOption().Replace("Correction", "Log"), logGoogle }

            // pick the next interval at random in [7000, 13000)
            // (presumably milliseconds, to space out the Google queries -- TODO confirm timer type)
            Random r        = new Random();
            int    interval = r.Next(7000, 13000);

            timerGoogle.Interval = interval;
            timerGoogle.Enabled  = true;
예제 #2
        // Handler for the btCorrectIt click event. Reads the OCR text file named in txOCR,
        // de-hyphenates and tokenizes it, filters the tokens, checks them against the dictionary and
        // collects unknown words in `errors`, then obtains suggestions according to the method
        // selected in cbMethod (the visible code has Hunspell, Google-timer and
        // correction.GetSuggestion paths). When a ground-truth file ("...GT.txt") exists, each
        // decision is also recorded through correction.UpdateFields for analysis.
        // NOTE(review): this excerpt has no braces and appears to have lost several short lines
        // (returns after the message boxes, some `else` keywords, the body of one `if`, the closers
        // of the dictionary initializers). The notes below mark those spots -- confirm against the
        // complete file.
        private void btCorrectIt_Click(object sender, EventArgs e)
            // input checks: a method must be chosen and the OCR file must exist
            // NOTE(review): no early return is visible after either message box -- presumably dropped
            if (cbMethod.SelectedItem.ToString() == "-- Choose method --")
                MessageBox.Show("Choose method first", "", MessageBoxButtons.OK);
            if (!File.Exists(txOCR.Text.Trim()))
                MessageBox.Show("Browse file first", "", MessageBoxButtons.OK);


            articleFile = txOCR.Text.Trim();
            Correction correction = new Correction();
            Stemmer    stemmer    = new Stemmer();

            // DeHyphenate and clean text:
            string dehyphenatedText = correction.DeHyphenate(articleFile);

            // show the de-hyphenated text in the rtbOCR control
            rtbOCR.Text = dehyphenatedText;

            // for analysis:
            string dehyphenatedTextGT = "";

            // look for a ground-truth file: the OCR path with its last 4 characters replaced by
            // "GT.txt" (assumes a 3-letter extension such as ".txt" -- TODO confirm)
            if (File.Exists(articleFile.Substring(0, articleFile.Length - 4) + "GT.txt"))
                articleFileGT = articleFile.Substring(0, articleFile.Length - 4) + "GT.txt";
            articleFileName = Path.GetFileName(articleFile);
            if (!string.IsNullOrEmpty(articleFileGT))
                dehyphenatedTextGT = correction.DeHyphenate(articleFileGT);

            // tokenize:
            deHyphenateTokens = correction.GetTokensFromText(dehyphenatedText);

            Regex rgx = new Regex("[^a-zA-Z]"); // matches any character that is not an ASCII letter; used below to strip such characters from tokens

            // for analysis:
            Dictionary <int, string> deHyphenateTokensGT = new Dictionary <int, string>();

            // store every OCR token together with the ground-truth token at the same key, both
            // stripped of non-letters; the indexer throws if the ground truth lacks that key
            if (!string.IsNullOrEmpty(articleFileGT))
                deHyphenateTokensGT = correction.GetTokensFromText(dehyphenatedTextGT);
                foreach (KeyValuePair <int, string> token in deHyphenateTokens)
                    correction.InsertOCRAndTruth(articleFileName, token.Key, rgx.Replace(token.Value, ""), rgx.Replace(deHyphenateTokensGT[token.Key], ""));

            // Keep only tokens longer than one character that contain at least one letter, strip
            // non-letters from them, then drop tokens that consist solely of upper-case characters:
            var tmp = deHyphenateTokens.Where(p => p.Value.Length > 1).ToDictionary(p => p.Key, p => p.Value);

            tmp = tmp.Where(p => p.Value.Any(Char.IsLetter)).ToDictionary(p => p.Key, p => rgx.Replace(p.Value, ""));
            Dictionary <int, string> cleanTokens = tmp.Where(p => !p.Value.All(Char.IsUpper)).ToDictionary(p => p.Key, p => p.Value);

            // Find Suggestion:
            if (cbMethod.SelectedItem.ToString().EndsWith("Hunspell"))
                // NOTE(review): hunspellLog is never assigned in the visible code, so the log field
                // written below would be empty -- confirm
                string hunspellLog = "";
                // find Suggestion using Hunspell:
                foreach (KeyValuePair <int, string> err in cleanTokens)
                    // Hunspell is queried with the modern-spelling, lower-cased form of the token
                    string        errInNewSpell       = correction.ChangeOldToNewSpell(err.Value).ToLowerInvariant();
                    List <string> hunspellSuggestions = new List <string>();
                    // a new SpellEngine is created (and disposed) for every token
                    using (SpellEngine engine = new SpellEngine())
                        LanguageConfig idConfig = new LanguageConfig();
                        idConfig.LanguageCode     = "id";
                        idConfig.HunspellAffFile  = "id_ID.aff";
                        idConfig.HunspellDictFile = "id_ID.dic";
                        idConfig.HunspellKey      = "";
                        // NOTE(review): idConfig is never handed to the engine in the visible code;
                        // a registration call was presumably dropped from this excerpt -- confirm
                        bool correct = engine["id"].Spell(errInNewSpell);
                        if (!correct)
                            hunspellSuggestions = engine["id"].Suggest(errInNewSpell);
                            // take the first suggestion (converted back to old spelling) when it differs from the token
                            if (hunspellSuggestions.Count > 0 && err.Value != correction.ChangeNewToOldSpell(hunspellSuggestions[0]))
                                deHyphenateTokens[err.Key] = "[" + correction.ChangeNewToOldSpell(hunspellSuggestions[0]) + "]";
                            // for analysis:
                            if (!string.IsNullOrEmpty(articleFileGT))
                                correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), rgx.Replace(deHyphenateTokens[err.Key], "") }, { getFieldNameFromOption().Replace("Correction", "Log"), hunspellLog }
                            // for analysis:
                            // NOTE(review): this update logs the token as correct, so it presumably
                            // belongs to an else of `if (!correct)` whose keyword is not visible -- confirm
                            if (!string.IsNullOrEmpty(articleFileGT))
                                correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), err.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), err.Value + " is correct" }

            // check only unique words (assumption: a word that occurs more than once is correct):
            // note that checkTokens refers to the same dictionary object as cleanTokens (no copy is made)
            Dictionary <int, string> checkTokens = cleanTokens;
            var duplicateValues = checkTokens.GroupBy(x => x.Value).Where(x => x.Count() > 1);

            List <int> duplicateKeys = new List <int>();

            // NOTE(review): the body of the `if` below is not visible, so duplicateKeys is never
            // filled in this excerpt; presumably the matching key is added (and the duplicates are
            // later excluded from checkTokens) -- confirm
            foreach (var item in checkTokens)
                foreach (var dup in duplicateValues)
                    if (item.Value == dup.Key)
            foreach (var dupkey in duplicateKeys)
                // for analysis: record the duplicate unchanged, with log "Duplicate", in every correction/log field pair
                if (!string.IsNullOrEmpty(articleFileGT))
                    correction.UpdateFields(articleFileName, dupkey, new Dictionary <string, string> {
                        { "NCorrection", checkTokens[dupkey] }, { "NLog", "Duplicate" }, { "Correction", checkTokens[dupkey] }, { "Log", "Duplicate" }, { "WOSearchCorrection", checkTokens[dupkey] }, { "WOSearchLog", "Duplicate" }, { "WOStemCorrection", checkTokens[dupkey] }, { "WOStemLog", "Duplicate" }, { "WOStemSearchCorrection", checkTokens[dupkey] }, { "WOStemSearchLog", "Duplicate" }, { "GooglePureCorrection", checkTokens[dupkey] }, { "GooglePureLog", "Duplicate" }

            // Check each word against the dictionary (KBBI + selected Kompas entries; city and country
            // entities; names of national heroes from the wiki):
            errors = new Dictionary <int, string>();
            foreach (KeyValuePair <int, string> token in checkTokens)
                // change Soewandi spelling to modern spelling (lower-cased):
                string wordToCheck = correction.ChangeOldToNewSpell(token.Value).ToLowerInvariant();

                // check word in Dictionary and Add to Error list if not there:
                int frequency;
                if (!correction.CheckUnigram(wordToCheck, getSQLQueryToCheckUnigram(), out frequency))
                    if (cbMethod.SelectedItem.ToString().Contains("Stemmer"))
                        // check again whether its stem is in the dictionary:
                        string stem = stemmer.Stemming(wordToCheck);
                        if (wordToCheck != stem && stemmer.checkStem(stem))
                            // for analysis
                            if (!string.IsNullOrEmpty(articleFileGT))
                                correction.UpdateFields(articleFileName, token.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), stem + " is word" }
                        else // if it is not in the dictionary:
                            errors.Add(token.Key, wordToCheck);
                        // NOTE(review): presumably the else branch of the "Stemmer" check (keyword not
                        // visible); Dictionary.Add would throw if the same key were added twice -- confirm
                        errors.Add(token.Key, wordToCheck);
                else // if it is in the dictionary:
                    // for analysis
                    if (!string.IsNullOrEmpty(articleFileGT))
                        correction.UpdateFields(articleFileName, token.Key, new Dictionary <string, string> {
                            { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), wordToCheck + " is correct" }

            // Find Suggestion:
            if (cbMethod.SelectedItem.ToString().EndsWith("Google"))
                // Google method: start the timer and reset the index it uses to walk through `errors`
                timerGoogle.Enabled = true;
                indexTimerGoogle    = 0;
                // NOTE(review): the loop below is presumably the else branch for the non-Google
                // methods (keyword not visible in this excerpt) -- confirm
                foreach (KeyValuePair <int, string> err in errors)
                    //get suggestion:
                    string log; string suggestion;
                    suggestion = correction.GetSuggestion(getSPNameForGetCandidates(), deHyphenateTokens, err.Key, err.Value, out log, getWithStemAndAffixCorrParamFromOption(), getWithSearchParamFromOption());

                    // Change suggestion back to Old Spell if any suggestions:
                    if (log != "No candidates")
                        suggestion = correction.ChangeNewToOldSpell(suggestion);

                    // update the token dictionary with the suggestion (in square brackets) unless it
                    // equals the current token, ignoring case:
                    if (!suggestion.Equals(deHyphenateTokens[err.Key], StringComparison.OrdinalIgnoreCase))
                        deHyphenateTokens[err.Key] = "[" + suggestion + "]";

                    // for analysis:
                    if (!string.IsNullOrEmpty(articleFileGT))
                        correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                            { getFieldNameFromOption(), suggestion }, { getFieldNameFromOption().Replace("Correction", "Log"), log }