예제 #1
0
        /// <summary>
        /// Builds correction candidates for a misspelled word by re-attaching every supported
        /// prefix/suffix combination to the root returned by GetRootWord and keeping the
        /// results whose edit distance to <paramref name="error"/> is at most 2.
        /// </summary>
        /// <remarks>
        /// Word pattern: [prefix-1] + [prefix-2] + root + [suffix] + [possessive] + [particle]
        ///   1. Particles:   -lah, -kah, -pun, -tah
        ///   2. Possessives: -ku, -mu, -nya
        ///   3. Suffixes:    -i, -an, -kan
        ///   4. Prefixes:    meN-, beN-, peN-, teN-, di-, ke-, se-
        /// Candidates are generated in the spelling produced by ChangeNewToOldSpell and
        /// reported in the spelling produced by ChangeOldToNewSpell.
        /// </remarks>
        /// <param name="key">Value copied into the Key of every returned candidate.</param>
        /// <param name="error">The misspelled word to find candidates for.</param>
        /// <param name="log">
        /// Receives "[candidate,distance,frequency;...]" for the returned candidates, or an
        /// empty string when no candidate was produced.
        /// </param>
        /// <returns>The candidates found; empty when the root is shorter than 3 characters
        /// or nothing is close enough to <paramref name="error"/>.</returns>
        public List<CorrectionCandidate> GetCandidatesFromAffixCorrection(int key, string error, out string log)
        {
            // Upper bound on the edit distance between a generated candidate and the error.
            const int MaxDistance = 2;

            log = "";

            // No entry may carry surrounding whitespace: the prefix is passed to the affixer as is.
            List<string> prefixes = new List<string>()
            {
                "", "di", "ke", "se", "ber", "bel", "be", "te", "ter", "me", "mem", "men", "meng", "menge", "meny", "pe", "per", "pem", "pen", "peng", "penge", "peny", "pel", "memper", "mempel", "menter", "member", "diper", "diter", "dipel", "diber", "keber", "keter"
            };
            List<string> baseSuffixes = new List<string>()
            {
                "i", "an", "kan"
            };
            List<string> possessives = new List<string>()
            {
                "ku", "mu", "nya"
            };
            List<string> particles = new List<string>()
            {
                "lah", "kah", "pun", "tah"
            };

            // Every ending allowed by the pattern [suffix] + [possessive] + [particle],
            // where each of the three parts is optional.
            List<string> suffixes = new List<string>()
            {
                ""
            };
            suffixes.AddRange(baseSuffixes);
            suffixes.AddRange(possessives);
            suffixes.AddRange(particles);

            foreach (string s in baseSuffixes)
            {
                foreach (string po in possessives)
                {
                    suffixes.Add(s + po);           // e.g. diperbaikinya
                    foreach (string pa in particles)
                    {
                        suffixes.Add(s + po + pa);  // e.g. dipelukanmulah
                    }
                }
            }

            foreach (string po in possessives)
            {
                foreach (string pa in particles)
                {
                    suffixes.Add(po + pa);          // e.g. -kulah, -nyalah
                }
            }

            foreach (string s in baseSuffixes)
            {
                foreach (string pa in particles)
                {
                    suffixes.Add(s + pa);           // e.g. pertahankanlah
                }
            }

            List<CorrectionCandidate> candidates = new List<CorrectionCandidate>();
            Dictionary<string, int> dicCandidates = new Dictionary<string, int>();
            Correction correct = new Correction();

            // Converted once; used both for the root lookup and for LengthDifference below.
            string newSpellError = correct.ChangeOldToNewSpell(error);
            string rootWord = GetRootWord(newSpellError);

            // Roots shorter than 3 characters are not expanded.
            if (string.IsNullOrEmpty(rootWord) || rootWord.Length < 3)
            {
                return candidates;
            }

            // Loop-invariant: the root in the spelling the affixer is fed with.
            string oldSpellRoot = correct.ChangeNewToOldSpell(rootWord);

            foreach (string prefix in prefixes)
            {
                foreach (string suffix in suffixes)
                {
                    // NOTE(review): a fresh Affixer per combination is kept from the original;
                    // it could be hoisted if Affixer turns out to be stateless.
                    Affixer affixer = new Affixer();
                    string candidate = correct.ChangeNewToOldSpell(affixer.Affixing(oldSpellRoot, prefix, suffix));
                    int distance = EditDistance.LevenshteinDistance(candidate, error, MaxDistance);

                    // -1 is treated as "no usable distance" (presumably the limit was exceeded).
                    if (distance == -1 || distance > MaxDistance)
                    {
                        continue;
                    }

                    // The first distance recorded for a given reported spelling wins.
                    string newSpellCandidate = correct.ChangeOldToNewSpell(candidate);
                    if (!dicCandidates.ContainsKey(newSpellCandidate))
                    {
                        dicCandidates.Add(newSpellCandidate, distance);
                    }
                }
            }

            if (dicCandidates.Count == 0)
            {
                return candidates;
            }

            Dictionary<string, int> dicCandidateAndFreq = GetFrequencies(dicCandidates.Keys.ToArray());
            System.Text.StringBuilder logBuilder = new System.Text.StringBuilder("[");

            foreach (KeyValuePair<string, int> can in dicCandidates)
            {
                // TryGetValue leaves frequency at 0 when the candidate has no recorded frequency.
                int frequency;
                dicCandidateAndFreq.TryGetValue(can.Key, out frequency);

                candidates.Add(new CorrectionCandidate
                {
                    Key              = key,
                    Error            = error,
                    Candidate        = can.Key,
                    SameBigramAmount = -1,
                    Frequency        = frequency,
                    LengthDifference = Math.Abs(can.Key.Length - newSpellError.Length),
                    Levensthein      = can.Value
                });

                logBuilder.Append(can.Key).Append(',')
                          .Append(can.Value).Append(',')
                          .Append(frequency).Append(';');
            }

            log = logBuilder.Append(']').ToString();
            return candidates;
        }
예제 #2
0
        /// <summary>
        /// Runs the stored procedure <paramref name="spName"/> for <paramref name="root"/>,
        /// re-attaches <paramref name="prefix"/> and <paramref name="suffix"/> to every
        /// returned "Unigram" value and keeps the results whose edit distance to
        /// <paramref name="error"/> is at most 2.
        /// </summary>
        /// <remarks>
        /// Sample stored-procedure call: call getCandidates('depat',2,0,1);
        /// </remarks>
        /// <param name="spName">Name of the stored procedure to execute.</param>
        /// <param name="key">Value copied into the Key of every returned candidate.</param>
        /// <param name="error">The misspelled word the affixed candidates are compared with.</param>
        /// <param name="root">Value passed to the procedure as @Word.</param>
        /// <param name="prefix">Prefix handed to the affixer for every returned row.</param>
        /// <param name="suffix">Suffix handed to the affixer for every returned row.</param>
        /// <param name="minSameBigramAmount">Value passed to the procedure as @MinSameBigramAmount.</param>
        /// <param name="minLengthVariant">Value passed to the procedure as @MinLengthVariant.</param>
        /// <param name="maxLevensthein">Value passed to the procedure as @MaxLevensthein.</param>
        /// <param name="log">
        /// Receives "[minSameBigramAmount,minLengthVariant,maxLevensthein][candidate,distance,frequency;...]".
        /// </param>
        /// <returns>The candidates that passed the distance filter, with Frequency filled in
        /// from GetFrequencies where available.</returns>
        /// <exception cref="Exception">
        /// Thrown when the database call fails; the originating MySqlException is available
        /// as InnerException.
        /// </exception>
        public List<CorrectionCandidate> GetCandidates(string spName, int key, string error, string root, string prefix, string suffix, int minSameBigramAmount, int minLengthVariant, int maxLevensthein, out string log)
        {
            // NOTE(review): maxLevensthein is only forwarded to the stored procedure; the local
            // filter on the affixed candidate uses this fixed limit. Confirm that is intended.
            const int MaxCandidateDistance = 2;

            log = "";
            List<CorrectionCandidate> lsCandidates = new List<CorrectionCandidate>();

            try
            {
                // using blocks release the connection, command and reader on every path,
                // including when a row fails to convert or the query throws.
                using (MySqlConnection conn = new MySqlConnection())
                using (MySqlCommand cmd = new MySqlCommand())
                {
                    conn.ConnectionString = MariaDBConn;
                    conn.Open();

                    cmd.Connection  = conn;
                    cmd.CommandText = spName;
                    cmd.CommandType = CommandType.StoredProcedure;
                    cmd.Parameters.AddWithValue("@Word", root).Direction = ParameterDirection.Input;
                    cmd.Parameters.AddWithValue("@MinSameBigramAmount", minSameBigramAmount).Direction = ParameterDirection.Input;
                    cmd.Parameters.AddWithValue("@MinLengthVariant", minLengthVariant).Direction = ParameterDirection.Input;
                    cmd.Parameters.AddWithValue("@MaxLevensthein", maxLevensthein).Direction = ParameterDirection.Input;
                    cmd.Parameters.AddWithValue("@IsLemma", 1).Direction = ParameterDirection.Input;

                    using (MySqlDataReader dataReader = cmd.ExecuteReader())
                    {
                        while (dataReader.Read())
                        {
                            string  stem       = dataReader["Unigram"].ToString();
                            Affixer affixer    = new Affixer();
                            string  sCandidate = affixer.Affixing(stem, prefix, suffix);
                            int     distance   = EditDistance.LevenshteinDistance(sCandidate, error, MaxCandidateDistance);

                            // -1 is treated as "no usable distance" (presumably the limit was exceeded).
                            if (distance == -1 || distance > MaxCandidateDistance)
                            {
                                continue;
                            }

                            lsCandidates.Add(new CorrectionCandidate
                            {
                                Key              = key,
                                Error            = error,
                                Candidate        = sCandidate,
                                SameBigramAmount = Convert.ToInt32(dataReader["SameBigramAmount"]),
                                Frequency        = 0,
                                LengthDifference = Convert.ToInt32(dataReader["LengthDifference"]),
                                Levensthein      = distance
                            });
                        }
                    }
                }
            }
            catch (MySql.Data.MySqlClient.MySqlException ex)
            {
                // Same exception type and message as before, but the original exception
                // (stack trace, error number) is preserved as InnerException.
                throw new Exception(ex.Message, ex);
            }

            System.Text.StringBuilder candidateLog = new System.Text.StringBuilder();

            // Look up frequencies only when there is something to look up.
            if (lsCandidates.Count > 0)
            {
                List<string> sCandidates = new List<string>(lsCandidates.Count);
                foreach (CorrectionCandidate cc in lsCandidates)
                {
                    sCandidates.Add(cc.Candidate);
                }

                Dictionary<string, int> dicCandidateAndFreq = GetFrequencies(sCandidates.ToArray());

                foreach (CorrectionCandidate candidate in lsCandidates)
                {
                    // Candidates without a recorded frequency keep the initial Frequency of 0.
                    int frequency;
                    if (dicCandidateAndFreq.TryGetValue(candidate.Candidate, out frequency))
                    {
                        candidate.Frequency = frequency;
                    }

                    candidateLog.Append(candidate.Candidate).Append(',')
                                .Append(candidate.Levensthein).Append(',')
                                .Append(candidate.Frequency.ToString()).Append(';');
                }
            }

            log = string.Format("[{0},{1},{2}][{3}]", minSameBigramAmount, minLengthVariant, maxLevensthein, candidateLog);

            return lsCandidates;
        }