/// <summary>
/// Sums the search hits of the two bigrams formed by the candidate word and its
/// immediate left and right neighbours in the token dictionary.
/// </summary>
/// <param name="deHyphenateTokens">Tokens of the text, keyed by position.</param>
/// <param name="candidate">Candidate whose <c>Key</c> gives the position and whose <c>Candidate</c> gives the word; its <c>Hits</c> is set to the returned total.</param>
/// <param name="isUsingGoogle">Passed through unchanged to <c>GetSearchPhraseHits</c>.</param>
/// <param name="log">Receives one "(phrase,hits)" entry per bigram that was searched; empty when no neighbour exists.</param>
/// <returns>The total number of hits over the searched bigrams (0 when the candidate has no neighbours).</returns>
private int GetBigramSearchHits(Dictionary<int, string> deHyphenateTokens, CorrectionCandidate candidate, bool isUsingGoogle, out string log)
{
    Correction correct = new Correction();
    int hits = 0;
    log = "";

    // TryGetValue instead of comparing against Keys.Min()/Keys.Max(): it is a single
    // O(1) lookup, does not throw on an empty dictionary, and skips a neighbour that
    // is missing from the dictionary instead of throwing KeyNotFoundException.
    string neighbour;

    // Left bigram: "<previous token> <candidate>".
    if (deHyphenateTokens.TryGetValue(candidate.Key - 1, out neighbour))
    {
        string phrase = string.Format("{0} {1}", correct.ChangeOldToNewSpell(neighbour), candidate.Candidate);
        int hit = GetSearchPhraseHits(phrase, isUsingGoogle);
        log += string.Format("({0},{1})", phrase, hit);
        hits += hit;
    }

    // Right bigram: "<candidate> <next token>".
    if (deHyphenateTokens.TryGetValue(candidate.Key + 1, out neighbour))
    {
        string phrase = string.Format("{0} {1}", candidate.Candidate, correct.ChangeOldToNewSpell(neighbour));
        int hit = GetSearchPhraseHits(phrase, isUsingGoogle);
        log += string.Format("({0},{1})", phrase, hit);
        hits += hit;
    }

    candidate.Hits = hits;
    return hits;
}
/// <summary>
/// Searches the trigrams that contain the token at <paramref name="key"/> (token as last,
/// middle and first word, in that order) and returns the first non-empty suggestion
/// reported by <c>GoogleSearch</c>.
/// </summary>
/// <param name="deHyphenateTokens">Tokens of the text, keyed by position. Must contain <paramref name="key"/>.</param>
/// <param name="key">Position of the token to check.</param>
/// <param name="totalHits">Sum of the hits of the trigrams that were searched without yielding a suggestion.</param>
/// <param name="log">Receives one "(suggestion,'phrase',hits)" entry per trigram that was searched.</param>
/// <returns>The first non-empty suggestion, or the original token when no trigram produced one.</returns>
public string GetGoogleSuggestionFromTrigram(Dictionary<int, string> deHyphenateTokens, int key, out int totalHits, out string log)
{
    Correction corr = new Correction();
    log = "";
    totalHits = 0;

    // Throws KeyNotFoundException for an unknown key, as before.
    string token = deHyphenateTokens[key];

    // Position of the first word of each trigram window: the token is the last,
    // the middle, then the first word of the window.
    int[] windowStarts = { key - 2, key - 1, key };

    foreach (int start in windowStarts)
    {
        // A window is only searched when all three positions exist. TryGetValue replaces
        // the repeated Keys.Min()/Keys.Max() scans and skips a window with a missing
        // position instead of throwing.
        string first, second, third;
        if (!deHyphenateTokens.TryGetValue(start, out first)
            || !deHyphenateTokens.TryGetValue(start + 1, out second)
            || !deHyphenateTokens.TryGetValue(start + 2, out third))
        {
            continue;
        }

        string phrase = corr.ChangeOldToNewSpell(string.Format("{0} {1} {2}", first, second, third));
        string googleSuggestion;
        int hits = GoogleSearch(phrase, token, out googleSuggestion);

        // The entry is logged whether or not a suggestion came back.
        log += string.Format("({0},'{1}',{2})", googleSuggestion, phrase, hits);

        if (googleSuggestion != "")
        {
            // First suggestion wins; the remaining windows are not searched and the
            // hits of this window are not added to totalHits.
            return googleSuggestion;
        }

        totalHits += hits;
    }

    return token;
}
/// <summary>
/// Click handler that runs the selected correction method on the OCR file named in
/// <c>txOCR</c>: de-hyphenates and tokenizes the text, filters the tokens worth
/// checking, then either corrects them with Hunspell, starts the Google timer, or
/// asks <c>Correction.GetSuggestion</c> for each word not found in the dictionary.
/// Corrected tokens are written back to <c>deHyphenateTokens</c> wrapped in "[...]".
/// When a ground-truth path is set in <c>articleFileGT</c>, per-token results are also
/// stored through <c>Correction.UpdateFields</c> for analysis.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btCorrectIt_Click(object sender, EventArgs e)
{
    // SelectedItem is null when nothing is selected; treat that like the placeholder
    // entry instead of dereferencing it.
    string method = cbMethod.SelectedItem == null ? null : cbMethod.SelectedItem.ToString();
    if (method == null || method == "-- Choose method --")
    {
        MessageBox.Show("Choose method first", "", MessageBoxButtons.OK);
        return;
    }

    if (!File.Exists(txOCR.Text.Trim()))
    {
        MessageBox.Show("Browse file first", "", MessageBoxButtons.OK);
        return;
    }

    ResetControls(false);
    articleFile = txOCR.Text.Trim();
    Correction correction = new Correction();
    Stemmer stemmer = new Stemmer();

    // De-hyphenate and clean the text.
    string dehyphenatedText = correction.DeHyphenate(articleFile);
    rtbOCR.Text = dehyphenatedText;

    // For analysis: the ground truth is looked up next to the OCR file, with the last
    // four characters of the path replaced by "GT.txt". The length guard keeps
    // Substring from throwing on paths shorter than four characters.
    // NOTE(review): articleFileGT is only assigned here when the companion file exists
    // and is never cleared in this handler; if nothing else clears it, the value of a
    // previous run is reused - confirm.
    if (articleFile.Length >= 4)
    {
        string groundTruthPath = articleFile.Substring(0, articleFile.Length - 4) + "GT.txt";
        if (File.Exists(groundTruthPath))
        {
            articleFileGT = groundTruthPath;
        }
    }

    articleFileName = Path.GetFileName(articleFile);
    bool hasGroundTruth = !string.IsNullOrEmpty(articleFileGT);

    string dehyphenatedTextGT = "";
    if (hasGroundTruth)
    {
        dehyphenatedTextGT = correction.DeHyphenate(articleFileGT);
    }

    // Tokenize.
    deHyphenateTokens = correction.GetTokensFromText(dehyphenatedText);

    // Strips every character that is not an ASCII letter.
    Regex rgx = new Regex("[^a-zA-Z]");

    // For analysis: store each OCR token next to the ground-truth token at the same position.
    if (hasGroundTruth)
    {
        Dictionary<int, string> deHyphenateTokensGT = correction.GetTokensFromText(dehyphenatedTextGT);
        foreach (KeyValuePair<int, string> token in deHyphenateTokens)
        {
            // The ground truth may have fewer tokens than the OCR text; record an empty
            // truth for those positions instead of throwing KeyNotFoundException.
            string truth;
            if (!deHyphenateTokensGT.TryGetValue(token.Key, out truth))
            {
                truth = "";
            }

            correction.InsertOCRAndTruth(articleFileName, token.Key, rgx.Replace(token.Value, ""), rgx.Replace(truth, ""));
        }
    }

    // Keep tokens longer than one character that contain a letter, strip their
    // non-alphabetic characters, then drop tokens that are entirely upper case.
    Dictionary<int, string> cleanTokens = deHyphenateTokens
        .Where(p => p.Value.Length > 1 && p.Value.Any(Char.IsLetter))
        .Select(p => new KeyValuePair<int, string>(p.Key, rgx.Replace(p.Value, "")))
        .Where(p => !p.Value.All(Char.IsUpper))
        .ToDictionary(p => p.Key, p => p.Value);

    // Find suggestions using Hunspell.
    if (method.EndsWith("Hunspell", StringComparison.Ordinal))
    {
        CorrectWithHunspell(correction, cleanTokens, rgx);
        ResetControls(true);
        return;
    }

    // Check only unique words (assumption: a word that occurs more than once is correct).
    // Duplicates are removed from cleanTokens in place.
    foreach (int dupKey in FindDuplicateTokenKeys(cleanTokens))
    {
        string word = cleanTokens[dupKey];

        // For analysis: mark the token as a duplicate for every method's columns.
        if (hasGroundTruth)
        {
            correction.UpdateFields(articleFileName, dupKey, new Dictionary<string, string>
            {
                { "NCorrection", word }, { "NLog", "Duplicate" },
                { "Correction", word }, { "Log", "Duplicate" },
                { "WOSearchCorrection", word }, { "WOSearchLog", "Duplicate" },
                { "WOStemCorrection", word }, { "WOStemLog", "Duplicate" },
                { "WOStemSearchCorrection", word }, { "WOStemSearchLog", "Duplicate" },
                { "GooglePureCorrection", word }, { "GooglePureLog", "Duplicate" }
            });
        }

        cleanTokens.Remove(dupKey);
    }

    // Check the remaining words against the dictionary; unknown words go to errors.
    errors = new Dictionary<int, string>();
    bool useStemmer = method.Contains("Stemmer");
    foreach (KeyValuePair<int, string> token in cleanTokens)
    {
        // Convert to the modern spelling before the lookup.
        string wordToCheck = correction.ChangeOldToNewSpell(token.Value).ToLowerInvariant();

        int frequency;
        if (correction.CheckUnigram(wordToCheck, getSQLQueryToCheckUnigram(), out frequency))
        {
            // Found in the dictionary.
            RecordAnalysisResult(correction, token.Key, token.Value, wordToCheck + " is correct");
            continue;
        }

        if (useStemmer)
        {
            // Not found as-is: accept the word when its stem differs and passes checkStem.
            string stem = stemmer.Stemming(wordToCheck);
            if (wordToCheck != stem && stemmer.checkStem(stem))
            {
                RecordAnalysisResult(correction, token.Key, token.Value, stem + " is word");
                continue;
            }
        }

        // Not in the dictionary.
        errors.Add(token.Key, wordToCheck);
    }

    // Find suggestions.
    if (method.EndsWith("Google", StringComparison.Ordinal))
    {
        // The handler only starts the timer here and returns without calling ResetControls(true).
        timerGoogle.Enabled = true;
        indexTimerGoogle = 0;
        return;
    }

    foreach (KeyValuePair<int, string> err in errors)
    {
        string log;
        string suggestion = correction.GetSuggestion(getSPNameForGetCandidates(), deHyphenateTokens, err.Key, err.Value, out log, getWithStemAndAffixCorrParamFromOption(), getWithSearchParamFromOption());

        // Convert the suggestion back to the old spelling unless no candidates were found.
        if (log != "No candidates")
        {
            suggestion = correction.ChangeNewToOldSpell(suggestion);
        }

        // Replace the token only when the suggestion differs from it (ignoring case).
        if (!suggestion.Equals(deHyphenateTokens[err.Key], StringComparison.OrdinalIgnoreCase))
        {
            deHyphenateTokens[err.Key] = "[" + suggestion + "]";
        }

        RecordAnalysisResult(correction, err.Key, suggestion, log);
    }

    ResetControls(true);
}

/// <summary>
/// Stores a correction result and its log text for one token through
/// <c>Correction.UpdateFields</c>. Does nothing when <c>articleFileGT</c> is null or empty.
/// The correction field name comes from <c>getFieldNameFromOption()</c>; the log field
/// name is the same name with "Correction" replaced by "Log".
/// </summary>
private void RecordAnalysisResult(Correction correction, int tokenKey, string correctedWord, string logText)
{
    if (string.IsNullOrEmpty(articleFileGT))
    {
        return;
    }

    string correctionField = getFieldNameFromOption();
    correction.UpdateFields(articleFileName, tokenKey, new Dictionary<string, string>
    {
        { correctionField, correctedWord },
        { correctionField.Replace("Correction", "Log"), logText }
    });
}

/// <summary>
/// Spell-checks every clean token with Hunspell (language "id") and replaces a misspelled
/// token in <c>deHyphenateTokens</c> with the first suggestion, converted to the old
/// spelling and wrapped in "[...]", when that suggestion differs from the token.
/// </summary>
private void CorrectWithHunspell(Correction correction, Dictionary<int, string> cleanTokens, Regex rgx)
{
    if (cleanTokens.Count == 0)
    {
        // Nothing to check, so the engine and its dictionary files are not loaded at all.
        return;
    }

    // Never populated in this code; an empty log is written for corrected tokens.
    string hunspellLog = "";

    // One engine for the whole run instead of creating it and adding the language once per token.
    using (SpellEngine engine = new SpellEngine())
    {
        LanguageConfig idConfig = new LanguageConfig();
        idConfig.LanguageCode = "id";
        idConfig.HunspellAffFile = "id_ID.aff";
        idConfig.HunspellDictFile = "id_ID.dic";
        idConfig.HunspellKey = "";
        engine.AddLanguage(idConfig);

        foreach (KeyValuePair<int, string> err in cleanTokens)
        {
            string errInNewSpell = correction.ChangeOldToNewSpell(err.Value).ToLowerInvariant();

            if (engine["id"].Spell(errInNewSpell))
            {
                RecordAnalysisResult(correction, err.Key, err.Value, err.Value + " is correct");
                continue;
            }

            List<string> hunspellSuggestions = engine["id"].Suggest(errInNewSpell);
            if (hunspellSuggestions.Count > 0)
            {
                string bestSuggestion = correction.ChangeNewToOldSpell(hunspellSuggestions[0]);
                if (err.Value != bestSuggestion)
                {
                    deHyphenateTokens[err.Key] = "[" + bestSuggestion + "]";
                }
            }

            // The regex strips the surrounding brackets (and any other non-letters) again.
            RecordAnalysisResult(correction, err.Key, rgx.Replace(deHyphenateTokens[err.Key], ""), hunspellLog);
        }
    }
}

/// <summary>
/// Returns the keys of all tokens whose word occurs more than once, in the enumeration
/// order of <paramref name="tokens"/>. Words are compared ordinally (case-sensitive).
/// </summary>
private static List<int> FindDuplicateTokenKeys(Dictionary<int, string> tokens)
{
    // Count each word once up front; the previous nested loop re-ran a deferred
    // GroupBy for every token.
    Dictionary<string, int> occurrences = new Dictionary<string, int>();
    foreach (string word in tokens.Values)
    {
        int seen;
        occurrences.TryGetValue(word, out seen);
        occurrences[word] = seen + 1;
    }

    return tokens.Where(t => occurrences[t.Value] > 1).Select(t => t.Key).ToList();
}
/// <summary>
/// Builds correction candidates for <paramref name="error"/> by re-attaching every
/// prefix/suffix combination to the root word of the error and keeping the forms whose
/// Levenshtein distance to the error is at most 2.
/// </summary>
/// <param name="key">Token position copied into each candidate's <c>Key</c>.</param>
/// <param name="error">The misspelled word.</param>
/// <param name="log">Receives "[candidate,distance,frequency;...]" for the returned candidates; empty when there are none.</param>
/// <returns>The candidates, each with its distance and frequency; an empty list when the root word is shorter than three characters or no affixed form is close enough.</returns>
public List<CorrectionCandidate> GetCandidatesFromAffixCorrection(int key, string error, out string log)
{
    // Word shape: [prefix-1] + [prefix-2] + root + [suffix] + [possessive] + [particle]
    // 1. Particles:   -lah, -kah, -pun, -tah.
    // 2. Possessives: -ku, -mu, -nya.
    // 3. Suffixes:    -i, -an, -kan.
    // 4. Prefixes:    meN-, beN-, peN-, teN-, di-, ke-, se-.
    log = "";

    // "menge" and "diber" previously carried a leading space (" menge", " diber"), so
    // those two prefixes were never passed to the affixer in their proper form.
    List<string> prefixes = new List<string>()
    {
        "", "di", "ke", "se", "ber", "bel", "be", "te", "ter",
        "me", "mem", "men", "meng", "menge", "meny",
        "pe", "per", "pem", "pen", "peng", "penge", "peny", "pel",
        "memper", "mempel", "menter", "member",
        "diper", "diter", "dipel", "diber", "keber", "keter"
    };
    List<string> baseSuffixes = new List<string>() { "i", "an", "kan" };
    List<string> possessives = new List<string>() { "ku", "mu", "nya" };
    List<string> particles = new List<string>() { "lah", "kah", "pun", "tah" };

    // Every ending allowed by the word shape above: none, each single part, and each
    // ordered combination of suffix, possessive and particle.
    List<string> suffixes = new List<string>() { "" };
    suffixes.AddRange(baseSuffixes);
    suffixes.AddRange(possessives);
    suffixes.AddRange(particles);
    foreach (string s in baseSuffixes)
    {
        foreach (string po in possessives)
        {
            suffixes.Add(s + po); // e.g. diperbaikinya
        }
    }
    foreach (string s in baseSuffixes)
    {
        foreach (string pa in particles)
        {
            suffixes.Add(s + pa); // e.g. pertahankanlah
        }
    }
    foreach (string po in possessives)
    {
        foreach (string pa in particles)
        {
            suffixes.Add(po + pa); // possessive + particle without a base suffix
        }
    }
    foreach (string s in baseSuffixes)
    {
        foreach (string po in possessives)
        {
            foreach (string pa in particles)
            {
                // Previously s + pa was added here, which dropped the possessive and
                // only duplicated the suffix+particle entries.
                suffixes.Add(s + po + pa); // e.g. dipelukanmulah
            }
        }
    }

    List<CorrectionCandidate> candidates = new List<CorrectionCandidate>();
    Correction correct = new Correction();

    string errorInNewSpell = correct.ChangeOldToNewSpell(error);
    string rootWord = GetRootWord(errorInNewSpell);
    if (string.IsNullOrEmpty(rootWord) || rootWord.Length < 3)
    {
        return candidates;
    }

    // Loop-invariant: the root is converted once instead of once per prefix/suffix pair.
    string rootInOldSpell = correct.ChangeNewToOldSpell(rootWord);

    // Candidate (new spelling) -> Levenshtein distance to the error.
    Dictionary<string, int> dicCandidates = new Dictionary<string, int>();
    foreach (string prefix in prefixes)
    {
        foreach (string suffix in suffixes)
        {
            Affixer affixer = new Affixer();
            string candidate = correct.ChangeNewToOldSpell(affixer.Affixing(rootInOldSpell, prefix, suffix));

            int levenshtein = EditDistance.LevenshteinDistance(candidate, error, 2);
            if (levenshtein == -1 || levenshtein > 2)
            {
                continue;
            }

            string candidateInNewSpell = correct.ChangeOldToNewSpell(candidate);
            if (!dicCandidates.ContainsKey(candidateInNewSpell))
            {
                dicCandidates.Add(candidateInNewSpell, levenshtein);
            }
        }
    }

    if (dicCandidates.Count == 0)
    {
        return candidates;
    }

    Dictionary<string, int> dicCandidateAndFreq = GetFrequencies(dicCandidates.Keys.ToArray());
    foreach (KeyValuePair<string, int> can in dicCandidates)
    {
        // Candidates without a frequency entry count as frequency 0.
        int frequency;
        if (!dicCandidateAndFreq.TryGetValue(can.Key, out frequency))
        {
            frequency = 0;
        }

        CorrectionCandidate corrcandidate = new CorrectionCandidate
        {
            Key = key,
            Error = error,
            Candidate = can.Key,
            SameBigramAmount = -1,
            Frequency = frequency,
            LengthDifference = Math.Abs(can.Key.Length - errorInNewSpell.Length),
            Levensthein = can.Value
        };
        candidates.Add(corrcandidate);
        log += can.Key + "," + can.Value + "," + frequency.ToString() + ";";
    }

    if (log.Length > 0)
    {
        log = "[" + log + "]";
    }

    return candidates;
}