예제 #1
0
        /// <summary>
        /// Sums the search hits of the bigrams that the candidate forms with its direct
        /// neighbours: "previous-token candidate" and "candidate next-token".
        /// </summary>
        /// <param name="deHyphenateTokens">Tokens keyed by an integer index; the entries at
        /// <c>candidate.Key - 1</c> and <c>candidate.Key + 1</c> are used as neighbours.</param>
        /// <param name="candidate">Candidate being scored; its <c>Hits</c> property is set to the returned total.</param>
        /// <param name="isUsingGoogle">Forwarded unchanged to <c>GetSearchPhraseHits</c>.</param>
        /// <param name="log">Receives one "(phrase,hits)" entry per bigram that was searched; empty when none was.</param>
        /// <returns>The total number of hits over the searched bigrams (0 when the candidate has no neighbours).</returns>
        private int GetBigramSearchHits(Dictionary <int, string> deHyphenateTokens, CorrectionCandidate candidate, bool isUsingGoogle, out string log)
        {
            Correction correct = new Correction();
            int        hits    = 0;

            log = "";

            // TryGetValue replaces the former Keys.Min()/Keys.Max() range checks: those scanned
            // the whole dictionary, threw on an empty dictionary, and did not guarantee that the
            // neighbouring key was actually present before the indexer was used.
            string neighbour;

            // Left bigram: "<previous token> <candidate>".
            if (deHyphenateTokens.TryGetValue(candidate.Key - 1, out neighbour))
            {
                string phrase = string.Format("{0} {1}", correct.ChangeOldToNewSpell(neighbour), candidate.Candidate);
                int    hit    = GetSearchPhraseHits(phrase, isUsingGoogle);
                log  += string.Format("({0},{1})", phrase, hit);
                hits += hit;
            }

            // Right bigram: "<candidate> <next token>".
            if (deHyphenateTokens.TryGetValue(candidate.Key + 1, out neighbour))
            {
                string phrase = string.Format("{0} {1}", candidate.Candidate, correct.ChangeOldToNewSpell(neighbour));
                int    hit    = GetSearchPhraseHits(phrase, isUsingGoogle);
                log  += string.Format("({0},{1})", phrase, hit);
                hits += hit;
            }

            candidate.Hits = hits;
            return(hits);
        }
예제 #2
0
        /// <summary>
        /// Queries <c>GoogleSearch</c> with up to three trigrams that contain the token at
        /// <paramref name="key"/> (as last, middle and first word, in that order) and returns
        /// the first non-empty suggestion it reports.
        /// </summary>
        /// <param name="deHyphenateTokens">Tokens keyed by an integer index; neighbouring indexes are treated as neighbouring words.</param>
        /// <param name="key">Index of the token to get a suggestion for. The dictionary indexer throws
        /// <see cref="KeyNotFoundException"/> when it is not present.</param>
        /// <param name="totalHits">Sum of the hit counts of the trigrams that produced no suggestion.
        /// The hits of the trigram that produced the returned suggestion are not added.</param>
        /// <param name="log">Receives one "(suggestion,'phrase',hits)" entry per trigram that was searched.</param>
        /// <returns>The first suggestion that is not the empty string, or the original token when no trigram yields one.</returns>
        public string GetGoogleSuggestionFromTrigram(Dictionary <int, string> deHyphenateTokens, int key, out int totalHits, out string log)
        {
            Correction corr = new Correction();

            log       = "";
            totalHits = 0;

            string suggestion       = deHyphenateTokens[key];
            string googleSuggestion = "";

            // Start index of each three-token window, in the order the searches are tried:
            // token is the last word, the middle word, then the first word.
            int[] windowStarts = new[] { key - 2, key - 1, key };

            foreach (int start in windowStarts)
            {
                // A window is only searched when all three tokens exist. TryGetValue also covers
                // gaps in the key sequence, which the former Min()/Max() range checks did not.
                string first, second, third;
                if (!deHyphenateTokens.TryGetValue(start, out first) ||
                    !deHyphenateTokens.TryGetValue(start + 1, out second) ||
                    !deHyphenateTokens.TryGetValue(start + 2, out third))
                {
                    continue;
                }

                string phrase = corr.ChangeOldToNewSpell(string.Format("{0} {1} {2}", first, second, third));
                int    hits   = GoogleSearch(phrase, suggestion, out googleSuggestion);

                log += string.Format("({0},'{1}',{2})", googleSuggestion, phrase, hits);

                // The first suggestion wins; its hits are intentionally not added to totalHits.
                if (googleSuggestion != "")
                {
                    return(googleSuggestion);
                }
                totalHits += hits;
            }

            return(suggestion);
        }
예제 #3
0
        /// <summary>
        /// Click handler that runs the selected correction method on the OCR file named in
        /// <c>txOCR</c>: de-hyphenates and tokenizes the text, filters the tokens, detects
        /// suspicious words and replaces them in <c>deHyphenateTokens</c> with a bracketed
        /// suggestion. When a companion "...GT.txt" file is known, per-token results are also
        /// recorded through <c>Correction.UpdateFields</c> for analysis.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btCorrectIt_Click(object sender, EventArgs e)
        {
            // SelectedItem is null when nothing is selected; treat that like the placeholder
            // entry instead of failing with a NullReferenceException.
            if (cbMethod.SelectedItem == null || cbMethod.SelectedItem.ToString() == "-- Choose method --")
            {
                MessageBox.Show("Choose method first", "", MessageBoxButtons.OK);
                return;
            }
            if (!File.Exists(txOCR.Text.Trim()))
            {
                MessageBox.Show("Browse file first", "", MessageBoxButtons.OK);
                return;
            }

            ResetControls(false);

            // The selected method name drives every branch below; read it once.
            string method     = cbMethod.SelectedItem.ToString();
            bool   useStemmer = method.Contains("Stemmer");

            articleFile = txOCR.Text.Trim();
            Correction correction = new Correction();
            Stemmer    stemmer    = new Stemmer();

            // DeHyphenate and clean text:
            string dehyphenatedText = correction.DeHyphenate(articleFile);

            rtbOCR.Text = dehyphenatedText;

            // for analysis: the ground-truth file is the OCR path with its last four characters
            // (e.g. ".txt") replaced by "GT.txt". The length guard avoids an
            // ArgumentOutOfRangeException from Substring on very short paths.
            // NOTE(review): articleFileGT is only assigned when that file exists and is never
            // cleared here; unless ResetControls clears it, a value from an earlier run would be
            // reused for a file without ground truth - confirm.
            string dehyphenatedTextGT = "";

            if (articleFile.Length >= 4)
            {
                string groundTruthPath = articleFile.Substring(0, articleFile.Length - 4) + "GT.txt";
                if (File.Exists(groundTruthPath))
                {
                    articleFileGT = groundTruthPath;
                }
            }
            articleFileName = Path.GetFileName(articleFile);

            bool hasGroundTruth = !string.IsNullOrEmpty(articleFileGT);
            if (hasGroundTruth)
            {
                dehyphenatedTextGT = correction.DeHyphenate(articleFileGT);
            }

            // tokenize:
            deHyphenateTokens = correction.GetTokensFromText(dehyphenatedText);

            Regex rgx = new Regex("[^a-zA-Z]"); // strips every non-alphabetic character from a word

            // for analysis: store each OCR token next to the ground-truth token with the same key.
            // The indexer throws KeyNotFoundException when the ground truth has no token for a key.
            Dictionary <int, string> deHyphenateTokensGT = new Dictionary <int, string>();

            if (hasGroundTruth)
            {
                deHyphenateTokensGT = correction.GetTokensFromText(dehyphenatedTextGT);
                foreach (KeyValuePair <int, string> token in deHyphenateTokens)
                {
                    correction.InsertOCRAndTruth(articleFileName, token.Key, rgx.Replace(token.Value, ""), rgx.Replace(deHyphenateTokensGT[token.Key], ""));
                }
            }

            // Omit single-character tokens, tokens without any letter and (after stripping
            // non-alphabetic characters) tokens that consist of capitals only:
            var tmp = deHyphenateTokens.Where(p => p.Value.Length > 1).ToDictionary(p => p.Key, p => p.Value);

            tmp = tmp.Where(p => p.Value.Any(Char.IsLetter)).ToDictionary(p => p.Key, p => rgx.Replace(p.Value, ""));
            Dictionary <int, string> cleanTokens = tmp.Where(p => !p.Value.All(Char.IsUpper)).ToDictionary(p => p.Key, p => p.Value);

            // Find Suggestion using Hunspell:
            if (method.EndsWith("Hunspell", StringComparison.Ordinal))
            {
                string hunspellLog = "";

                if (cleanTokens.Count > 0)
                {
                    // The engine and its dictionary files are set up once for all tokens rather
                    // than once per token.
                    using (SpellEngine engine = new SpellEngine())
                    {
                        LanguageConfig idConfig = new LanguageConfig();
                        idConfig.LanguageCode     = "id";
                        idConfig.HunspellAffFile  = "id_ID.aff";
                        idConfig.HunspellDictFile = "id_ID.dic";
                        idConfig.HunspellKey      = "";
                        engine.AddLanguage(idConfig);

                        foreach (KeyValuePair <int, string> err in cleanTokens)
                        {
                            string errInNewSpell = correction.ChangeOldToNewSpell(err.Value).ToLowerInvariant();
                            bool   correct       = engine["id"].Spell(errInNewSpell);

                            if (!correct)
                            {
                                List <string> hunspellSuggestions = engine["id"].Suggest(errInNewSpell);
                                if (hunspellSuggestions.Count > 0)
                                {
                                    // Only the first suggestion is used, converted back to the old spelling.
                                    string bestInOldSpell = correction.ChangeNewToOldSpell(hunspellSuggestions[0]);
                                    if (err.Value != bestInOldSpell)
                                    {
                                        deHyphenateTokens[err.Key] = "[" + bestInOldSpell + "]";
                                    }
                                }
                                // for analysis:
                                if (hasGroundTruth)
                                {
                                    correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                                        { getFieldNameFromOption(), rgx.Replace(deHyphenateTokens[err.Key], "") }, { getFieldNameFromOption().Replace("Correction", "Log"), hunspellLog }
                                    });
                                }
                            }
                            else
                            {
                                // for analysis:
                                if (hasGroundTruth)
                                {
                                    correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                                        { getFieldNameFromOption(), err.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), err.Value + " is correct" }
                                    });
                                }
                            }
                        }
                    }
                }
                ResetControls(true);
                return;
            }

            // Check only unique words (assumption: a word that occurs more than once is correct).
            // checkTokens is the same dictionary instance as cleanTokens, as in the original code.
            Dictionary <int, string> checkTokens = cleanTokens;

            // Materialise the repeated values once; the former deferred GroupBy query was
            // re-evaluated for every token, which made this step quadratic.
            HashSet <string> duplicateValues = new HashSet <string>(
                checkTokens.GroupBy(x => x.Value).Where(g => g.Count() > 1).Select(g => g.Key));
            List <int> duplicateKeys = checkTokens.Where(t => duplicateValues.Contains(t.Value)).Select(t => t.Key).ToList();

            foreach (var dupkey in duplicateKeys)
            {
                // for analysis
                if (hasGroundTruth)
                {
                    correction.UpdateFields(articleFileName, dupkey, new Dictionary <string, string> {
                        { "NCorrection", checkTokens[dupkey] }, { "NLog", "Duplicate" }, { "Correction", checkTokens[dupkey] }, { "Log", "Duplicate" }, { "WOSearchCorrection", checkTokens[dupkey] }, { "WOSearchLog", "Duplicate" }, { "WOStemCorrection", checkTokens[dupkey] }, { "WOStemLog", "Duplicate" }, { "WOStemSearchCorrection", checkTokens[dupkey] }, { "WOStemSearchLog", "Duplicate" }, { "GooglePureCorrection", checkTokens[dupkey] }, { "GooglePureLog", "Duplicate" }
                    });
                }
                checkTokens.Remove(dupkey);
            }

            // Check words against the dictionary (KBBI + selected Kompas words, city and country
            // entities, names of national heroes from Wikipedia):
            errors = new Dictionary <int, string>();
            foreach (KeyValuePair <int, string> token in checkTokens)
            {
                // change Soewandi to Modern Spelling:
                string wordToCheck = correction.ChangeOldToNewSpell(token.Value).ToLowerInvariant();

                // check word in Dictionary and Add to Error list if not there:
                int frequency;
                if (!correction.CheckUnigram(wordToCheck, getSQLQueryToCheckUnigram(), out frequency))
                {
                    if (useStemmer)
                    {
                        // check again whether its stem is in the dictionary:
                        string stem = stemmer.Stemming(wordToCheck);
                        if (wordToCheck != stem && stemmer.checkStem(stem))
                        {
                            // for analysis
                            if (hasGroundTruth)
                            {
                                correction.UpdateFields(articleFileName, token.Key, new Dictionary <string, string> {
                                    { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), stem + " is word" }
                                });
                            }
                        }
                        else // the stem is not in the dictionary either:
                        {
                            errors.Add(token.Key, wordToCheck);
                        }
                    }
                    else
                    {
                        errors.Add(token.Key, wordToCheck);
                    }
                }
                else // the word is in the dictionary:
                {
                    // for analysis
                    if (hasGroundTruth)
                    {
                        correction.UpdateFields(articleFileName, token.Key, new Dictionary <string, string> {
                            { getFieldNameFromOption(), token.Value }, { getFieldNameFromOption().Replace("Correction", "Log"), wordToCheck + " is correct" }
                        });
                    }
                }
            }

            // Find Suggestion:
            if (method.EndsWith("Google", StringComparison.Ordinal))
            {
                // Google-based methods continue from the timer; this handler only starts it.
                timerGoogle.Enabled = true;
                indexTimerGoogle    = 0;
                return;
            }

            foreach (KeyValuePair <int, string> err in errors)
            {
                // get suggestion:
                string log;
                string suggestion = correction.GetSuggestion(getSPNameForGetCandidates(), deHyphenateTokens, err.Key, err.Value, out log, getWithStemAndAffixCorrParamFromOption(), getWithSearchParamFromOption());

                // Change suggestion back to Old Spell if any suggestions:
                if (log != "No candidates")
                {
                    suggestion = correction.ChangeNewToOldSpell(suggestion);
                }

                // update token dic with suggestion (only when it differs from the current token):
                if (!suggestion.Equals(deHyphenateTokens[err.Key], StringComparison.OrdinalIgnoreCase))
                {
                    deHyphenateTokens[err.Key] = "[" + suggestion + "]";
                }

                // for analysis:
                if (hasGroundTruth)
                {
                    correction.UpdateFields(articleFileName, err.Key, new Dictionary <string, string> {
                        { getFieldNameFromOption(), suggestion }, { getFieldNameFromOption().Replace("Correction", "Log"), log }
                    });
                }
            }
            ResetControls(true);
        }
예제 #4
0
        /// <summary>
        /// Builds correction candidates for <paramref name="error"/> by re-attaching every
        /// supported prefix and ending to its root word and keeping the forms whose Levenshtein
        /// distance to the error is at most 2.
        /// </summary>
        /// <remarks>
        /// Word shape: [prefix-1] + [prefix-2] + root + [suffix] + [possessive] + [particle]
        /// 1. Particles: -lah, -kah, -pun, -tah.
        /// 2. Possessives: -ku, -mu, -nya.
        /// 3. Suffixes: -i, -an, -kan.
        /// 4. Prefixes: meN-, beN-, peN-, teN-, di-, ke-, se-.
        /// </remarks>
        /// <param name="key">Token index, copied into every returned candidate.</param>
        /// <param name="error">The suspicious word as it appears in the text.</param>
        /// <param name="log">Receives "[candidate,distance,frequency;...]" for the returned candidates, or "" when there are none.</param>
        /// <returns>The candidates found; an empty list when the root word is missing or shorter
        /// than three characters, or when no generated form is close enough.</returns>
        public List <CorrectionCandidate> GetCandidatesFromAffixCorrection(int key, string error, out string log)
        {
            log = "";

            // Fix: "menge" and "diber" used to carry a stray leading space (" menge", " diber"),
            // which put a blank in front of every form generated from them.
            List <string> prefixes = new List <string>()
            {
                "", "di", "ke", "se", "ber", "bel", "be", "te", "ter", "me", "mem", "men", "meng", "menge", "meny", "pe", "per", "pem", "pen", "peng", "penge", "peny", "pel", "memper", "mempel", "menter", "member", "diper", "diter", "dipel", "diber", "keber", "keter"
            };
            List <string> baseSuffixes = new List <string>()
            {
                "i", "an", "kan"
            };
            List <string> possessives = new List <string>()
            {
                "ku", "mu", "nya"
            };
            List <string> particles = new List <string>()
            {
                "lah", "kah", "pun", "tah"
            };

            // Every ending allowed by [suffix] + [possessive] + [particle], each part optional
            // (4 * 4 * 5 = 80 endings). Fix: the previous loops added suffix+particle where
            // suffix+possessive+particle was meant and never produced possessive+particle, so
            // forms such as "...kanmulah" or "...nyalah" were never generated.
            List <string> suffixes = new List <string>()
            {
                ""
            };

            suffixes.AddRange(baseSuffixes);
            suffixes.AddRange(possessives);
            suffixes.AddRange(particles);

            foreach (string s in baseSuffixes)
            {
                foreach (string po in possessives)
                {
                    suffixes.Add(s + po);       // example: diperbaikinya
                }
            }
            foreach (string s in baseSuffixes)
            {
                foreach (string pa in particles)
                {
                    suffixes.Add(s + pa);       // example: pertahankanlah
                }
            }
            foreach (string po in possessives)
            {
                foreach (string pa in particles)
                {
                    suffixes.Add(po + pa);      // ending such as -nyalah
                }
            }
            foreach (string s in baseSuffixes)
            {
                foreach (string po in possessives)
                {
                    foreach (string pa in particles)
                    {
                        suffixes.Add(s + po + pa); // example: dipelukanmulah
                    }
                }
            }

            List <CorrectionCandidate> candidates    = new List <CorrectionCandidate>();
            Dictionary <string, int>   dicCandidates = new Dictionary <string, int>();
            Correction correct = new Correction();

            // The new-spelling form of the error is needed for the root lookup and for the
            // length difference of every candidate, so it is converted once.
            string errorInNewSpell = correct.ChangeOldToNewSpell(error);
            string rootWord        = GetRootWord(errorInNewSpell);

            // IsNullOrEmpty also covers a null root, which would otherwise fail on .Length.
            if (string.IsNullOrEmpty(rootWord) || rootWord.Length < 3)
            {
                return(candidates);
            }

            string rootInOldSpell = correct.ChangeNewToOldSpell(rootWord);

            foreach (string prefix in prefixes)
            {
                foreach (string suffix in suffixes)
                {
                    // NOTE(review): a new Affixer per combination is kept from the original code;
                    // it can be hoisted out of the loops if Affixer holds no state - confirm.
                    Affixer affixer     = new Affixer();
                    string  candidate   = correct.ChangeNewToOldSpell(affixer.Affixing(rootInOldSpell, prefix, suffix));
                    int     levenshtein = EditDistance.LevenshteinDistance(candidate, error, 2);

                    // -1 is treated as "no usable distance"; anything above 2 is too far away.
                    if (levenshtein == -1 || levenshtein > 2)
                    {
                        continue;
                    }

                    // Candidates are collected in the new spelling; the first distance seen for a form wins.
                    string candidateInNewSpell = correct.ChangeOldToNewSpell(candidate);
                    if (!dicCandidates.ContainsKey(candidateInNewSpell))
                    {
                        dicCandidates.Add(candidateInNewSpell, levenshtein);
                    }
                }
            }
            if (dicCandidates.Count == 0)
            {
                return(candidates);
            }

            Dictionary <string, int> dicCandidateAndFreq = GetFrequencies(dicCandidates.Keys.ToArray());
            System.Text.StringBuilder logEntries = new System.Text.StringBuilder();

            foreach (KeyValuePair <string, int> can in dicCandidates)
            {
                // Candidates without a frequency entry count as frequency 0.
                int frequency;
                if (!dicCandidateAndFreq.TryGetValue(can.Key, out frequency))
                {
                    frequency = 0;
                }

                CorrectionCandidate corrcandidate = new CorrectionCandidate
                {
                    Key              = key,
                    Error            = error,
                    Candidate        = can.Key,
                    SameBigramAmount = -1,
                    Frequency        = frequency,
                    LengthDifference = Math.Abs(can.Key.Length - errorInNewSpell.Length),
                    Levensthein      = can.Value
                };
                candidates.Add(corrcandidate);
                logEntries.Append(can.Key).Append(",").Append(can.Value).Append(",").Append(frequency.ToString()).Append(";");
            }

            if (logEntries.Length > 0)
            {
                log = "[" + logEntries + "]";
            }
            return(candidates);
        }