コード例 #1
0
        // Checks how EditDistance.LevenshteinDistance handles null and empty inputs:
        // the distance to a non-empty string must equal that string's length, and the
        // distance between null and the empty string must be 0, in either argument order.
        public void NullOrEmptyStringsDistanceCheck()
        {
            const string word = "test";
            string missing = null;
            string empty = string.Empty;

            // Assert.AreEqual reports the expected and actual values when it fails,
            // which Assert.IsTrue(a == b) cannot do.

            // null against a non-empty string, both argument orders.
            Assert.AreEqual(word.Length, EditDistance.LevenshteinDistance(missing, word));
            Assert.AreEqual(word.Length, EditDistance.LevenshteinDistance(word, missing));

            // Empty against a non-empty string, both argument orders.
            Assert.AreEqual(word.Length, EditDistance.LevenshteinDistance(word, empty));
            Assert.AreEqual(word.Length, EditDistance.LevenshteinDistance(empty, word));

            // null against empty, both argument orders.
            Assert.AreEqual(0, EditDistance.LevenshteinDistance(missing, empty));
            Assert.AreEqual(0, EditDistance.LevenshteinDistance(empty, missing));
        }
コード例 #2
0
ファイル: Correction.cs プロジェクト: pakdanan/OCRCorrection
        /// <summary>
        /// Runs an exact (quoted) search for <paramref name="phrase"/> on google.co.id through a
        /// Selenium-driven Chrome instance and reads the hit count and any spelling suggestion
        /// from the result page. Every page lookup is best-effort: a failure is traced and the
        /// corresponding value keeps its default.
        /// </summary>
        /// <param name="phrase">Phrase to search for; it is trimmed and URL-escaped before being placed in the query.</param>
        /// <param name="error">Misspelled word; a suggested word is kept only when its Levenshtein distance to this word is at most 4.</param>
        /// <param name="suggestion">Receives the suggested word closest to <paramref name="error"/>, or an empty string when none qualifies.</param>
        /// <returns>The hit count read from the page (clamped to <see cref="int.MaxValue"/>), or 0 when it is unavailable or the page reports no results.</returns>
        public int GoogleSearch(string phrase, string error, out string suggestion)
        {
            suggestion = "";
            int hits = 0;

            // Escape the phrase so characters such as '&', '#' or '+' cannot break the query string.
            string url = string.Format("https://www.google.co.id/search?q=\"{0}\"", Uri.EscapeDataString(phrase.Trim()));

            // Pause for a random 5-9 seconds before each search (presumably to avoid being throttled).
            Random r = new Random();
            Thread.Sleep(r.Next(5000, 9000));

            using (ChromeDriver driver = new ChromeDriver(SeleniumFolder))
            {
                driver.Navigate().GoToUrl(url);

                try
                {
                    string stats = driver.FindElement(By.XPath("//div[@id=\"resultStats\"]")).Text;

                    // NOTE(review): the stats line appears to carry the elapsed time in parentheses after
                    // the count (e.g. "... (0,45 detik)") - confirm. That part is dropped so its digits are
                    // not glued onto the hit count when all non-digits are stripped below.
                    int parenthesis = stats.IndexOf('(');
                    if (parenthesis >= 0)
                    {
                        stats = stats.Substring(0, parenthesis);
                    }

                    // Parse as long and clamp: a count above int.MaxValue would otherwise fail to parse and read as 0.
                    long parsed;
                    if (long.TryParse(Regex.Replace(stats, "[^0-9]", ""), out parsed))
                    {
                        hits = (int)Math.Min(parsed, int.MaxValue);
                    }
                }
                catch (Exception ex)
                {
                    System.Diagnostics.Trace.WriteLine("GoogleSearch: result count not available: " + ex.Message);
                }

                try
                {
                    string notice = driver.FindElement(By.XPath("//div[@id=\"topstuff\"]")).Text;

                    // Indonesian "Hasil untuk ... tidak ditemukan" means "Results for ... not found".
                    if (notice.StartsWith("Hasil untuk", StringComparison.Ordinal) && notice.Contains("tidak ditemukan"))
                    {
                        hits = 0;
                    }
                }
                catch (Exception ex)
                {
                    System.Diagnostics.Trace.WriteLine("GoogleSearch: no-result notice not available: " + ex.Message);
                }

                try
                {
                    IWebElement spellNode = driver.FindElement(By.XPath("//a[@class=\"spell\"]"));

                    // Strip quotation marks, HTML-encoded or literal, so they do not end up in the suggested words.
                    string html = spellNode.GetAttribute("innerHTML").Replace("&quot;", "").Replace("\"", "");

                    // Corrected words are the text between "<b><i>" and "</i></b>".
                    Dictionary<string, int> candidates = new Dictionary<string, int>();
                    foreach (string piece in Regex.Split(html, "<b><i>"))
                    {
                        string fragment = piece.Trim();
                        int end = fragment.IndexOf("</i></b>", StringComparison.Ordinal);
                        if (end < 0)
                        {
                            continue;
                        }

                        string word = fragment.Substring(0, end);
                        int distance = EditDistance.LevenshteinDistance(word, error, 4);

                        // Skip repeated words: Dictionary.Add would throw on a duplicate key and
                        // every suggestion collected so far would be lost.
                        if (distance != -1 && distance <= 4 && !candidates.ContainsKey(word))
                        {
                            candidates.Add(word, distance);
                        }
                    }

                    if (candidates.Count > 0)
                    {
                        // OrderBy is a stable sort, so ties keep the order in which the words were found.
                        suggestion = candidates.OrderBy(p => p.Value).First().Key;
                    }
                }
                catch (Exception ex)
                {
                    System.Diagnostics.Trace.WriteLine("GoogleSearch: spelling suggestion not available: " + ex.Message);
                }
            }

            return hits;
        }
コード例 #3
0
        // Checks the Levenshtein distance between a string and its reverse,
        // for an odd-length and an even-length input.
        public void ReversedStringsDistanceCheck()
        {
            string s1 = "asdfghjkl";
            string s2 = "lkjhgfdsa";

            // Expected length - 1, because one char is on the same place (odd number of chars).
            // Assert.AreEqual reports the expected and actual values when it fails.
            Assert.AreEqual(s1.Length - 1, EditDistance.LevenshteinDistance(s1, s2));

            s1 = "qazwsxedcrfv";
            s2 = "vfrcdexswzaq";

            // Even number of chars: the test expects a distance equal to the full length.
            Assert.AreEqual(s1.Length, EditDistance.LevenshteinDistance(s1, s2));
        }
コード例 #4
0
        // Checks the Levenshtein distance between strings of different lengths.
        public void DifferentLengthStringsDistanceCheck()
        {
            string s1 = "short string";
            string s2 = "long string, but not so much";

            // Expected length - 7, because we have 7 consecutive chars that are the same in both strings.
            // Assert.AreEqual prints the actual distance on failure, so no separate trace output is needed.
            Assert.AreEqual(s2.Length - 7, EditDistance.LevenshteinDistance(s1, s2));

            s1 = "another one";
            s2 = "and another one";

            // Expected 4, because 4 chars from the second are not present in the first.
            Assert.AreEqual(4, EditDistance.LevenshteinDistance(s1, s2));
        }
コード例 #5
0
ファイル: Correction.cs プロジェクト: pakdanan/OCRCorrection
        /// <summary>
        /// Generates correction candidates for <paramref name="error"/> by re-attaching every supported
        /// prefix/suffix combination to the root word of the error and keeping the results whose
        /// Levenshtein distance to the error is at most 2.
        /// </summary>
        /// <param name="key">Identifier copied into each candidate's <c>Key</c>.</param>
        /// <param name="error">The misspelled word to correct.</param>
        /// <param name="log">Receives "[candidate,distance,frequency;...]" for the candidates found, or an empty string when there are none.</param>
        /// <returns>
        /// The candidates found. The list is empty when the root word is missing or shorter than
        /// 3 characters, or when no generated word is close enough to the error.
        /// </returns>
        public List<CorrectionCandidate> GetCandidatesFromAffixCorrection(int key, string error, out string log)
        {
            // Word shape: [prefix-1] + [prefix-2] + root + [suffix] + [possessive] + [particle]
            // Prefixes: meN-, beN-, peN-, teN-, di-, ke-, se- (and their combinations).
            log = "";
            List<string> prefixes = new List<string>()
            {
                "", "di", "ke", "se", "ber", "bel", "be", "te", "ter", "me", "mem", "men", "meng", "menge", "meny", "pe", "per", "pem", "pen", "peng", "penge", "peny", "pel", "memper", "mempel", "menter", "member", "diper", "diter", "dipel", "diber", "keber", "keter"
            };
            List<string> suffixes = BuildAffixSuffixCombinations();

            List<CorrectionCandidate> candidates = new List<CorrectionCandidate>();
            Dictionary<string, int> dicCandidates = new Dictionary<string, int>();
            Correction correct = new Correction();

            // The new-spelling form of the error is needed for the root lookup and for LengthDifference.
            string newSpellError = correct.ChangeOldToNewSpell(error);
            string rootWord = GetRootWord(newSpellError);

            if (string.IsNullOrEmpty(rootWord) || rootWord.Length < 3)
            {
                return candidates;
            }

            string oldSpellRoot = correct.ChangeNewToOldSpell(rootWord);

            foreach (string prefix in prefixes)
            {
                foreach (string suffix in suffixes)
                {
                    Affixer affixer = new Affixer();
                    string candidate = correct.ChangeNewToOldSpell(affixer.Affixing(oldSpellRoot, prefix, suffix));
                    int levenshtein = EditDistance.LevenshteinDistance(candidate, error, 2);
                    if (levenshtein == -1 || levenshtein > 2)
                    {
                        continue;
                    }

                    // Candidates are keyed by their new-spelling form; the first distance seen is kept.
                    string newSpellCandidate = correct.ChangeOldToNewSpell(candidate);
                    if (!dicCandidates.ContainsKey(newSpellCandidate))
                    {
                        dicCandidates.Add(newSpellCandidate, levenshtein);
                    }
                }
            }

            if (dicCandidates.Count == 0)
            {
                return candidates;
            }

            Dictionary<string, int> dicCandidateAndFreq = GetFrequencies(dicCandidates.Keys.ToArray());
            System.Text.StringBuilder logEntries = new System.Text.StringBuilder();

            foreach (KeyValuePair<string, int> can in dicCandidates)
            {
                // A candidate without a frequency entry counts as frequency 0.
                int frequency;
                if (!dicCandidateAndFreq.TryGetValue(can.Key, out frequency))
                {
                    frequency = 0;
                }

                candidates.Add(new CorrectionCandidate
                {
                    Key              = key,
                    Error            = error,
                    Candidate        = can.Key,
                    SameBigramAmount = -1,
                    Frequency        = frequency,
                    LengthDifference = Math.Abs(can.Key.Length - newSpellError.Length),
                    Levensthein      = can.Value
                });
                logEntries.Append(can.Key).Append(',').Append(can.Value).Append(',').Append(frequency).Append(';');
            }

            if (logEntries.Length > 0)
            {
                log = "[" + logEntries + "]";
            }
            return candidates;
        }

        /// <summary>
        /// Builds every ending allowed by [suffix] + [possessive] + [particle], where each of the
        /// three slots is optional. The empty string (no ending at all) is included.
        /// </summary>
        /// <returns>The distinct endings, 80 in total.</returns>
        private static List<string> BuildAffixSuffixCombinations()
        {
            string[] baseSuffixes = { "i", "an", "kan" };
            string[] possessives  = { "ku", "mu", "nya" };
            string[] particles    = { "lah", "kah", "pun", "tah" };

            List<string> endings = new List<string>() { "" };
            endings.AddRange(baseSuffixes);
            endings.AddRange(possessives);
            endings.AddRange(particles);

            foreach (string s in baseSuffixes)
            {
                foreach (string po in possessives)
                {
                    endings.Add(s + po);          // example: diperbaikinya
                    foreach (string pa in particles)
                    {
                        endings.Add(s + po + pa); // example: dipelukanmulah
                    }
                }
            }

            foreach (string s in baseSuffixes)
            {
                foreach (string pa in particles)
                {
                    endings.Add(s + pa);          // example: pertahankanlah
                }
            }

            foreach (string po in possessives)
            {
                foreach (string pa in particles)
                {
                    endings.Add(po + pa);         // possessive + particle, e.g. "kulah"
                }
            }

            return endings;
        }
コード例 #6
0
ファイル: Correction.cs プロジェクト: pakdanan/OCRCorrection
        /// <summary>
        /// Calls the stored procedure <paramref name="spName"/> to fetch stems similar to
        /// <paramref name="root"/>, re-attaches <paramref name="prefix"/> and <paramref name="suffix"/>
        /// to each stem, and keeps the resulting words whose Levenshtein distance to
        /// <paramref name="error"/> is at most 2. Frequencies are then filled in from
        /// <c>GetFrequencies</c>.
        /// </summary>
        /// <param name="spName">Name of the stored procedure to execute.</param>
        /// <param name="key">Identifier copied into each candidate's <c>Key</c>.</param>
        /// <param name="error">The misspelled word the candidates are compared against.</param>
        /// <param name="root">Root word passed to the procedure as <c>@Word</c>.</param>
        /// <param name="prefix">Prefix re-attached to every stem returned by the procedure.</param>
        /// <param name="suffix">Suffix re-attached to every stem returned by the procedure.</param>
        /// <param name="minSameBigramAmount">Passed to the procedure as <c>@MinSameBigramAmount</c>.</param>
        /// <param name="minLengthVariant">Passed to the procedure as <c>@MinLengthVariant</c>.</param>
        /// <param name="maxLevensthein">
        /// Passed to the procedure as <c>@MaxLevensthein</c>. The final filter in this method uses a
        /// fixed threshold of 2 and does not depend on this value.
        /// </param>
        /// <param name="log">Receives "[minSameBigramAmount,minLengthVariant,maxLevensthein][candidate,distance,frequency;...]".</param>
        /// <returns>The candidates that passed the distance filter, possibly empty.</returns>
        /// <exception cref="Exception">A <c>MySqlException</c> occurred; it is attached as the inner exception.</exception>
        public List<CorrectionCandidate> GetCandidates(string spName, int key, string error, string root, string prefix, string suffix, int minSameBigramAmount, int minLengthVariant, int maxLevensthein, out string log)
        {
            // sample: call getCandidates('depat',2,0,1);
            log = "";
            List<CorrectionCandidate> lsCandidates = new List<CorrectionCandidate>();

            try
            {
                // The using blocks release the connection, command and reader on failure as well,
                // and before GetFrequencies runs below.
                using (MySqlConnection conn = new MySqlConnection())
                using (MySqlCommand cmd = new MySqlCommand())
                {
                    conn.ConnectionString = MariaDBConn;
                    conn.Open();

                    cmd.Connection  = conn;
                    cmd.CommandText = spName;
                    cmd.CommandType = CommandType.StoredProcedure;
                    cmd.Parameters.AddWithValue("@Word", root).Direction = ParameterDirection.Input;
                    cmd.Parameters.AddWithValue("@MinSameBigramAmount", minSameBigramAmount).Direction = ParameterDirection.Input;
                    cmd.Parameters.AddWithValue("@MinLengthVariant", minLengthVariant).Direction = ParameterDirection.Input;
                    cmd.Parameters.AddWithValue("@MaxLevensthein", maxLevensthein).Direction = ParameterDirection.Input;
                    cmd.Parameters.AddWithValue("@IsLemma", 1).Direction = ParameterDirection.Input;

                    using (MySqlDataReader dataReader = cmd.ExecuteReader())
                    {
                        while (dataReader.Read())
                        {
                            string  stem        = dataReader["Unigram"].ToString();
                            Affixer affixer     = new Affixer();
                            string  sCandidate  = affixer.Affixing(stem, prefix, suffix);
                            int     levensthein = EditDistance.LevenshteinDistance(sCandidate, error, 2);
                            if (levensthein == -1 || levensthein > 2)
                            {
                                continue;
                            }

                            lsCandidates.Add(new CorrectionCandidate
                            {
                                Key              = key,
                                Error            = error,
                                Candidate        = sCandidate,
                                SameBigramAmount = Convert.ToInt32(dataReader["SameBigramAmount"]),
                                Frequency        = 0,
                                LengthDifference = Convert.ToInt32(dataReader["LengthDifference"]),
                                Levensthein      = levensthein
                            });
                        }
                    }
                }
            }
            catch (MySql.Data.MySqlClient.MySqlException ex)
            {
                // Same exception type and message as before, but the original exception
                // (with its stack trace and error code) is kept as InnerException.
                throw new Exception(ex.Message, ex);
            }

            // Look up the frequency of every candidate and write it back into the list.
            string[] words = lsCandidates.ConvertAll(c => c.Candidate).ToArray();
            Dictionary<string, int> dicCandidateAndFreq = GetFrequencies(words);
            System.Text.StringBuilder logEntries = new System.Text.StringBuilder();

            foreach (CorrectionCandidate candidate in lsCandidates)
            {
                // A candidate without a frequency entry keeps the 0 it was created with.
                int frequency;
                if (dicCandidateAndFreq.TryGetValue(candidate.Candidate, out frequency))
                {
                    candidate.Frequency = frequency;
                }
                logEntries.Append(candidate.Candidate).Append(',').Append(candidate.Levensthein).Append(',').Append(candidate.Frequency).Append(';');
            }

            log = string.Format("[{0},{1},{2}][{3}]", minSameBigramAmount, minLengthVariant, maxLevensthein, logEntries.ToString());

            return lsCandidates;
        }