예제 #1
0
        /// <summary>
        /// Finishes the current sentence: appends a terminal node, scores every pair of candidates at
        /// adjacent word positions, and reads the chosen eojeols back through the back pointers.
        /// (The original documentation describes this as the Viterbi step that yields the most probable analysis.)
        /// </summary>
        /// <param name="sos">the candidate analyses of the sentence; supplies its length, identifiers and plain eojeols</param>
        /// <returns>a new sentence carrying the identifiers and plain eojeols of <paramref name="sos"/> plus the eojeols collected here</returns>
        private Sentence end_sentence(SetOfSentences sos)
        {
            /* Append the closing word position and give it a single node tagged "SF" */
            int closing = new_wp(" ");
            wp[closing].MNode = new_mnode(null, "SF", 0);

            /* Score each (current, following) node pair; node index 0 ends a sibling chain */
            int position = 1;
            while (position < wp_end - 1)
            {
                int current = wp[position].MNode;
                while (current != 0)
                {
                    int following = wp[position + 1].MNode;
                    while (following != 0)
                    {
                        update_prob_score(current, following);
                        following = mn[following].Sibling;
                    }
                    current = mn[current].Sibling;
                }
                position++;
            }

            /* Follow the back pointers from the position indexed by the sentence length, filling the result from the back */
            int slot = sos.length;
            Eojeol[] chosen = new Eojeol[slot];
            int node = wp[slot].MNode;
            while (node != 0)
            {
                slot--;
                chosen[slot] = mn[node].Eojeol;
                node = mn[node].Backptr;
            }

            return new Sentence(sos.DocumentID, sos.SentenceID, sos.EndOfDocument, sos.getPlainEojeolArray().ToArray(), chosen);
        }
예제 #2
0
 /// <summary>
 /// Fills the next free slot of the node table <c>mn</c> and returns that slot's index.
 /// </summary>
 /// <param name="eojeol">eojeol stored in the node</param>
 /// <param name="wp_tag">tag stored in the node</param>
 /// <param name="prob">value stored in the node's <c>Prob_Wt</c> (described by the original author as P(w|t))</param>
 /// <returns>the index of the slot that was filled; <c>mn_end</c> is advanced by one</returns>
 private int new_mnode(Eojeol eojeol, System.String wp_tag, double prob)
 {
     int slot = mn_end;

     // Write through the array index rather than a local copy of the element.
     mn[slot].Eojeol  = eojeol;
     mn[slot].Wp_Tag  = wp_tag;
     mn[slot].Prob_Wt = prob;

     // A fresh node starts with both link fields set to 0.
     mn[slot].Backptr = 0;
     mn[slot].Sibling = 0;

     mn_end = slot + 1;
     return slot;
 }
예제 #3
0
        /// <summary>
        /// Rewrites every tag of the sentence through <c>TagMapper.getKaistTagOnLevel</c> at <c>TAG_LEVEL</c>
        /// and fuses neighbouring morphemes that end up with the same tag.
        /// (The original documentation describes this as reducing the 69 KAIST tags to 9.)
        /// </summary>
        /// <param name="st">sentence to simplify; the arrays returned by each eojeol's <c>Tags</c> property are overwritten</param>
        /// <returns>the same <paramref name="st"/> instance after its <c>Eojeols</c> property has been reassigned</returns>
        public virtual Sentence doProcess(Sentence st)
        {
            Eojeol[] words = st.Eojeols;

            for (int w = 0; w < words.Length; w++)
            {
                System.String[] tags = words[w].Tags;

                // Pass 1: map each tag and note whether any two neighbours now carry the same one.
                System.String lastSeen = "";
                bool hasRun = false;
                for (int t = 0; t < tags.Length; t++)
                {
                    System.String mapped = TagMapper.getKaistTagOnLevel(tags[t], TAG_LEVEL);
                    tags[t] = mapped;
                    hasRun |= mapped.Equals(lastSeen);
                    lastSeen = mapped;
                }

                if (!hasRun)
                {
                    continue;
                }

                // Pass 2: carry the text of a morpheme into its successor while the tags repeat,
                // and emit a (tag, morpheme) pair each time the tag changes.
                tagList.Clear();
                morphemeList.Clear();
                System.String[] morphemes = words[w].Morphemes;

                int tail = tags.Length - 1;
                for (int cur = 0, next = 1; cur < tail; cur++, next++)
                {
                    if (!tags[cur].Equals(tags[next]))
                    {
                        tagList.Add(tags[cur]);
                        morphemeList.Add(morphemes[cur]);
                    }
                    else
                    {
                        morphemes[next] = morphemes[cur] + morphemes[next];
                    }
                }

                // The final pair is always emitted; it holds whatever text was carried forward.
                tagList.Add(tags[tail]);
                morphemeList.Add(morphemes[morphemes.Length - 1]);

                words[w] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
            }

            st.Eojeols = words;
            return st;
        }
예제 #4
0
        /// <summary>
        /// Expands every candidate that carries the tag "unk": a copy whose tag reads "nqq" is appended to
        /// the candidate set, and the scanned candidate's tag array is left with "ncn" in that position.
        /// </summary>
        /// <param name="sos">sentence candidates; the list returned by <c>getEojeolSetArray()</c> is updated in place</param>
        /// <returns>the same <paramref name="sos"/> instance</returns>
        public virtual SetOfSentences doProcess(SetOfSentences sos)
        {
            List <Eojeol []> candidateSets = sos.getEojeolSetArray();
            LinkedList <Eojeol> pending = new LinkedList <Eojeol>();

            for (int setIndex = 0; setIndex < candidateSets.Count; setIndex++)
            {
                Eojeol[] originalSet = candidateSets[setIndex];

                pending.Clear();
                foreach (Eojeol candidate in originalSet)
                {
                    pending.AddLast(candidate);
                }

                bool grew = false;

                // Count is re-read on every pass, so the copies appended below are scanned as well.
                for (int cursor = 0; cursor < pending.Count; cursor++)
                {
                    Eojeol current = pending.Get_Renamed(cursor);
                    System.String[] tags = current.Tags;
                    System.String[] morphemes = current.Morphemes;

                    for (int t = 0; t < tags.Length; t++)
                    {
                        if (!tags[t].Equals("unk"))
                        {
                            continue;
                        }

                        // Snapshot both arrays while the slot reads "nqq"; the snapshot becomes a new candidate.
                        tags[t] = "nqq";
                        pending.AddLast(new Eojeol((string[])morphemes.Clone(), (string[])tags.Clone()));

                        // The scanned candidate's own array ends up with "ncn" in that slot.
                        tags[t] = "ncn";
                        grew = true;
                    }
                }

                // Only replace the stored set when at least one candidate was added.
                if (grew)
                {
                    candidateSets[setIndex] = pending.ToArray(originalSet);
                }
            }

            return sos;
        }
예제 #5
0
        /// <summary>
        /// Expands every candidate that carries the tag "unk": a copy whose tag reads "nqq" is appended to
        /// the candidate set, and the scanned candidate's tag array is left with "ncn" in that position.
        /// </summary>
        /// <param name="sos">sentence candidates; the list returned by <c>getEojeolSetArray()</c> is updated in place</param>
        /// <returns>the same <paramref name="sos"/> instance</returns>
        public virtual SetOfSentences doProcess(SetOfSentences sos)
        {
            List< Eojeol [] > allSets = sos.getEojeolSetArray();
            LinkedList < Eojeol > queue = new LinkedList < Eojeol >();

            for (int setNo = 0; setNo < allSets.Count; setNo++)
            {
                Eojeol[] sourceSet = allSets[setNo];

                queue.Clear();
                foreach (Eojeol item in sourceSet)
                {
                    queue.AddLast(item);
                }

                bool addedAny = false;

                // The bound is the live Count, so entries appended during the scan are examined too.
                for (int at = 0; at < queue.Count; at++)
                {
                    Eojeol scanned = queue.Get_Renamed(at);
                    System.String[] tags = scanned.Tags;
                    System.String[] morphemes = scanned.Morphemes;

                    for (int slot = 0; slot < tags.Length; slot++)
                    {
                        if (!tags[slot].Equals("unk"))
                        {
                            continue;
                        }

                        // Clone the arrays while the slot holds "nqq" and queue the clone as a new candidate.
                        tags[slot] = "nqq";
                        queue.AddLast(new Eojeol((string[])morphemes.Clone(), (string[])tags.Clone()));

                        // Then leave "ncn" in the scanned candidate's array.
                        tags[slot] = "ncn";
                        addedAny = true;
                    }
                }

                // Store the enlarged set only when something was appended.
                if (addedAny)
                {
                    allSets[setNo] = queue.ToArray(sourceSet);
                }
            }

            return sos;
        }
예제 #6
0
        /// <summary>
        /// Produces the analyses of one plain eojeol, either by decoding its entry in the pre-analyzed
        /// dictionary or, when there is no entry, by running the chart analyzer.
        /// </summary>
        /// <param name="plainEojeol">plain eojeol to analyze</param>
        /// <returns>the contents of <c>eojeolList</c> as an array</returns>
        /// <remarks>
        /// A dictionary entry is read as analyses separated by '^'; within an analysis the tokens are
        /// separated by '+' or '/' and consumed as alternating morpheme and tag.
        /// </remarks>
        private Eojeol[] processEojeol(System.String plainEojeol)
        {
            System.String analysis = analyzedDic.get_Renamed(plainEojeol);

            eojeolList.Clear();

            if (analysis != null)
            {
                // the eojeol was registered in the pre-analyzed dictionary
                StringTokenizer st = new StringTokenizer(analysis, "^");
                while (st.HasMoreTokens)
                {
                    System.String analyzed = st.NextToken;

                    // The previous code passed the Java regex "\\+|/" to Split. .NET's string.Split
                    // treats a string separator literally, so nothing was ever split. Splitting on
                    // the two characters gives the tokenization that regex was meant to produce.
                    System.String[] tokens = analyzed.Split('+', '/');

                    // Tokens alternate morpheme, tag; an unpaired trailing token is ignored.
                    int pairCount = tokens.Length / 2;
                    System.String[] morphemes = new System.String[pairCount];
                    System.String[] tags      = new System.String[pairCount];

                    for (int i = 0, j = 0; i < pairCount; i++)
                    {
                        morphemes[i] = tokens[j++];
                        tags[i]      = tokens[j++];
                    }
                    eojeolList.AddLast(new Eojeol(morphemes, tags));
                }
            }
            else
            {
                // analyze the input plain eojeol
                // NOTE(review): the value of getResult() is discarded, so the chart presumably
                // fills eojeolList as a side effect -- confirm against the chart class.
                chart.init(plainEojeol);
                chart.analyze();
                chart.getResult();
            }

            return(eojeolList.ToArray());
        }
예제 #7
0
        /// <summary>
        /// Maps every tag of the sentence through <c>TagMapper.getKaistTagOnLevel</c> at <c>TAG_LEVEL</c>
        /// and joins adjacent morphemes whose mapped tags are equal.
        /// (The original documentation describes this as reducing the 69 KAIST tags to 9.)
        /// </summary>
        /// <param name="st">sentence to simplify; the arrays returned by each eojeol's <c>Tags</c> property are overwritten</param>
        /// <returns>the same <paramref name="st"/> instance after its <c>Eojeols</c> property has been reassigned</returns>
        public virtual Sentence doProcess(Sentence st)
        {
            Eojeol[] units = st.Eojeols;

            for (int u = 0; u < units.Length; u++)
            {
                System.String[] tags = units[u].Tags;

                // First sweep: replace each tag by its mapped form and detect repeated neighbours.
                System.String before = "";
                bool needsMerge = false;
                for (int k = 0; k < tags.Length; k++)
                {
                    System.String simplified = TagMapper.getKaistTagOnLevel(tags[k], TAG_LEVEL);
                    tags[k] = simplified;
                    needsMerge |= simplified.Equals(before);
                    before = simplified;
                }

                if (!needsMerge)
                {
                    continue;
                }

                // Second sweep: while the tag repeats, prepend the current text to the next morpheme;
                // when it changes, record the current (tag, morpheme) pair.
                tagList.Clear();
                morphemeList.Clear();
                System.String[] morphemes = units[u].Morphemes;

                int finalIndex = tags.Length - 1;
                for (int here = 0, ahead = 1; here < finalIndex; here++, ahead++)
                {
                    if (!tags[here].Equals(tags[ahead]))
                    {
                        tagList.Add(tags[here]);
                        morphemeList.Add(morphemes[here]);
                    }
                    else
                    {
                        morphemes[ahead] = morphemes[here] + morphemes[ahead];
                    }
                }

                // The last pair is recorded unconditionally.
                tagList.Add(tags[finalIndex]);
                morphemeList.Add(morphemes[morphemes.Length - 1]);

                units[u] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
            }

            st.Eojeols = units;
            return st;
        }
예제 #8
0
        /// <summary>
        /// Maps every tag of every candidate through <c>TagMapper.getKaistTagOnLevel</c> at <c>TAG_LEVEL</c>,
        /// joins adjacent morphemes whose mapped tags are equal, and drops candidates whose
        /// <c>ToString()</c> text duplicates an earlier candidate of the same set.
        /// (The original documentation describes this as reducing the 69 KAIST tags to 22.)
        /// </summary>
        /// <param name="sos">candidate analyses; the arrays returned by each eojeol's <c>Tags</c> property are overwritten</param>
        /// <returns>the same <paramref name="sos"/> instance after <c>setEojeolSetArray</c> has been given the new sets</returns>
        public virtual SetOfSentences doProcess(SetOfSentences sos)
        {
            List< Eojeol [] > inputSets  = sos.getEojeolSetArray();
            List< Eojeol [] > outputSets = new List< Eojeol [] >();

            int setCount = inputSets.Count;

            for (int s = 0; s < setCount; s++)
            {
                Eojeol[] candidates = inputSets[s];
                dupFilterMap.Clear();

                for (int c = 0; c < candidates.Length; c++)
                {
                    System.String[] tags = candidates[c].Tags;

                    // Map the tags and detect whether two neighbours now share a tag.
                    System.String lastSeen = "";
                    bool hasRun = false;
                    for (int t = 0; t < tags.Length; t++)
                    {
                        System.String mapped = TagMapper.getKaistTagOnLevel(tags[t], TAG_LEVEL);
                        tags[t] = mapped;
                        hasRun |= mapped.Equals(lastSeen);
                        lastSeen = mapped;
                    }

                    if (hasRun)
                    {
                        // Carry text forward while the tag repeats; emit a pair when it changes.
                        tagList.Clear();
                        morphemeList.Clear();
                        System.String[] morphemes = candidates[c].Morphemes;

                        int tail = tags.Length - 1;
                        for (int cur = 0, next = 1; cur < tail; cur++, next++)
                        {
                            if (!tags[cur].Equals(tags[next]))
                            {
                                tagList.Add(tags[cur]);
                                morphemeList.Add(morphemes[cur]);
                            }
                            else
                            {
                                morphemes[next] = morphemes[cur] + morphemes[next];
                            }
                        }
                        tagList.Add(tags[tail]);
                        morphemeList.Add(morphemes[morphemes.Length - 1]);

                        candidates[c] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
                    }

                    // The first candidate seen for a given text wins.
                    System.String signature = candidates[c].ToString();
                    if (!dupFilterMap.ContainsKey(signature))
                    {
                        dupFilterMap[signature] = candidates[c];
                    }
                }

                // Keep the original array when nothing was filtered out.
                if (candidates.Length == dupFilterMap.Count)
                {
                    outputSets.Add(candidates);
                }
                else
                {
                    outputSets.Add(dupFilterMap.Values.ToArray());
                }
            }

            sos.setEojeolSetArray(outputSets);
            return sos;
        }
예제 #9
0
        /// <summary>
        /// Rewrites the tags of every candidate via <c>TagMapper.getKaistTagOnLevel</c> at <c>TAG_LEVEL</c>,
        /// fuses neighbouring morphemes that end up with equal tags, and removes candidates whose
        /// <c>ToString()</c> text repeats an earlier candidate of the same set.
        /// (The original documentation describes this as reducing the 69 KAIST tags to 22.)
        /// </summary>
        /// <param name="sos">candidate analyses; the arrays returned by each eojeol's <c>Tags</c> property are overwritten</param>
        /// <returns>the same <paramref name="sos"/> instance after <c>setEojeolSetArray</c> has been given the new sets</returns>
        public virtual SetOfSentences doProcess(SetOfSentences sos)
        {
            List <Eojeol []> sourceSets = sos.getEojeolSetArray();
            List <Eojeol []> resultSets = new List <Eojeol []>();

            int total = sourceSets.Count;

            for (int setNo = 0; setNo < total; setNo++)
            {
                Eojeol[] group = sourceSets[setNo];
                dupFilterMap.Clear();

                for (int g = 0; g < group.Length; g++)
                {
                    System.String[] tags = group[g].Tags;

                    // First sweep: map each tag, remembering whether any neighbours coincide.
                    System.String before = "";
                    bool needsMerge = false;
                    for (int k = 0; k < tags.Length; k++)
                    {
                        System.String simplified = TagMapper.getKaistTagOnLevel(tags[k], TAG_LEVEL);
                        tags[k] = simplified;
                        needsMerge |= simplified.Equals(before);
                        before = simplified;
                    }

                    if (needsMerge)
                    {
                        // Second sweep: prepend text to the next morpheme while the tag repeats,
                        // otherwise record the current (tag, morpheme) pair.
                        tagList.Clear();
                        morphemeList.Clear();
                        System.String[] morphemes = group[g].Morphemes;

                        int finalIndex = tags.Length - 1;
                        for (int here = 0, ahead = 1; here < finalIndex; here++, ahead++)
                        {
                            if (!tags[here].Equals(tags[ahead]))
                            {
                                tagList.Add(tags[here]);
                                morphemeList.Add(morphemes[here]);
                            }
                            else
                            {
                                morphemes[ahead] = morphemes[here] + morphemes[ahead];
                            }
                        }
                        tagList.Add(tags[finalIndex]);
                        morphemeList.Add(morphemes[morphemes.Length - 1]);

                        group[g] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
                    }

                    // Only the first candidate with a given text is kept in the filter map.
                    System.String text = group[g].ToString();
                    if (!dupFilterMap.ContainsKey(text))
                    {
                        dupFilterMap[text] = group[g];
                    }
                }

                // Reuse the original array unless duplicates were removed.
                if (group.Length == dupFilterMap.Count)
                {
                    resultSets.Add(group);
                }
                else
                {
                    resultSets.Add(dupFilterMap.Values.ToArray());
                }
            }

            sos.setEojeolSetArray(resultSets);
            return sos;
        }
예제 #10
0
        /// <summary>
        /// Produces the analyses of one plain eojeol, either by decoding its entry in the pre-analyzed
        /// dictionary or, when there is no entry, by running the chart analyzer.
        /// </summary>
        /// <param name="plainEojeol">plain eojeol to analyze</param>
        /// <returns>the contents of <c>eojeolList</c> as an array</returns>
        /// <remarks>
        /// A dictionary entry is read as analyses separated by '^'; within an analysis the tokens are
        /// separated by '+' or '/' and consumed as alternating morpheme and tag.
        /// </remarks>
        private Eojeol[] processEojeol(System.String plainEojeol)
        {
            System.String analysis = analyzedDic.get_Renamed(plainEojeol);

            eojeolList.Clear();

            if (analysis != null)
            {
                // the eojeol was registered in the pre-analyzed dictionary
                StringTokenizer st = new StringTokenizer(analysis, "^");
                while (st.HasMoreTokens)
                {
                    System.String analyzed = st.NextToken;

                    // The previous code passed the Java regex "\\+|/" to Split. .NET's string.Split
                    // treats a string separator literally, so nothing was ever split. Splitting on
                    // the two characters gives the tokenization that regex was meant to produce.
                    System.String[] tokens = analyzed.Split('+', '/');

                    // Tokens alternate morpheme, tag; an unpaired trailing token is ignored.
                    int pairCount = tokens.Length / 2;
                    System.String[] morphemes = new System.String[pairCount];
                    System.String[] tags = new System.String[pairCount];

                    for (int i = 0, j = 0; i < pairCount; i++)
                    {
                        morphemes[i] = tokens[j++];
                        tags[i] = tokens[j++];
                    }
                    eojeolList.AddLast(new Eojeol(morphemes, tags));
                }
            }
            else
            {
                // analyze the input plain eojeol
                // NOTE(review): the value of getResult() is discarded, so the chart presumably
                // fills eojeolList as a side effect -- confirm against the chart class.
                chart.init(plainEojeol);
                chart.analyze();
                chart.getResult();
            }

            return eojeolList.ToArray();
        }
예제 #11
0
 /// <summary>
 /// Initializes the node at index <c>mn_end</c> of the node table and returns that index.
 /// </summary>
 /// <param name="eojeol">eojeol stored in the node</param>
 /// <param name="wp_tag">tag stored in the node</param>
 /// <param name="prob">value stored in the node's <c>Prob_Wt</c> (described by the original author as P(w|t))</param>
 /// <returns>the index that was initialized; <c>mn_end</c> is one greater afterwards</returns>
 private int new_mnode(Eojeol eojeol, System.String wp_tag, double prob)
 {
     int created = mn_end;

     // Assign through the array element itself so the table entry is what gets modified.
     mn[created].Eojeol = eojeol;
     mn[created].Wp_Tag = wp_tag;
     mn[created].Prob_Wt = prob;

     // Both link fields of a new node start at 0.
     mn[created].Backptr = 0;
     mn[created].Sibling = 0;

     mn_end = created + 1;
     return created;
 }
예제 #12
0
        /// <summary>
        /// Completes the current sentence: adds a terminal node, scores all candidate pairs at
        /// neighbouring word positions, and gathers the selected eojeols by walking the back pointers.
        /// (The original documentation describes this as the Viterbi step that yields the most probable analysis.)
        /// </summary>
        /// <param name="sos">the candidate analyses of the sentence; supplies its length, identifiers and plain eojeols</param>
        /// <returns>a new sentence carrying the identifiers and plain eojeols of <paramref name="sos"/> plus the eojeols gathered here</returns>
        private Sentence end_sentence(SetOfSentences sos)
        {
            /* Add the final word position with one node tagged "SF" */
            int terminal = new_wp(" ");
            wp[terminal].MNode = new_mnode(null, "SF", 0);

            /* Visit every (left, right) node pair of adjacent positions; index 0 marks the end of a sibling chain */
            int at = 1;
            while (at < wp_end - 1)
            {
                int left = wp[at].MNode;
                while (left != 0)
                {
                    int right = wp[at + 1].MNode;
                    while (right != 0)
                    {
                        update_prob_score(left, right);
                        right = mn[right].Sibling;
                    }
                    left = mn[left].Sibling;
                }
                at++;
            }

            /* Walk the back pointers from the position indexed by the sentence length, writing the result back to front */
            int remaining = sos.length;
            Eojeol[] picked = new Eojeol[remaining];
            int cursor = wp[remaining].MNode;
            while (cursor != 0)
            {
                remaining--;
                picked[remaining] = mn[cursor].Eojeol;
                cursor = mn[cursor].Backptr;
            }

            return new Sentence(sos.DocumentID, sos.SentenceID, sos.EndOfDocument, sos.getPlainEojeolArray().ToArray(), picked);
        }
예제 #13
0
        /// <summary>
        /// Scores one eojeol by walking its morphemes and combining, for each, three table lookups:
        /// the tag-pair entry, the single-tag entry and the morpheme/tag entry.
        /// (The original documentation describes the result as P(T_i, W_i).)
        /// </summary>
        /// <param name="eojeol">eojeol to score; its tag at index 0 is read unconditionally</param>
        /// <returns>the accumulated score</returns>
        /// <remarks>
        /// Every step contributes <c>lexicon + tbigram - tunigram</c>; the original comments read this as
        /// P(w|t) * P(t|t_prev) / P(t), which suggests the tables hold logarithms. A lookup that finds
        /// no entry falls back to <c>PCONSTANT</c>. Tag pairs are looked up under "left-right" keys, with
        /// "bnk" standing for the blank on either side of the eojeol.
        /// </remarks>
        private double compute_wt(Eojeol eojeol)
        {
            double tbigram, tunigram, lexicon;
            double[] hit;

            System.String tag = eojeol.getTag(0);

            /* first morpheme: pair entry for the transition from the leading blank */
            hit     = ptt_pos_tf.get_Renamed("bnk-" + tag);
            tbigram = (hit != null) ? hit[0] : PCONSTANT;

            /* first morpheme: single-tag entry */
            hit      = ptt_pos_tf.get_Renamed(tag);
            tunigram = (hit != null) ? hit[0] : PCONSTANT;

            /* first morpheme: morpheme/tag entry */
            hit     = pwt_pos_tf.get_Renamed(eojeol.getMorpheme(0) + "/" + tag);
            lexicon = (hit != null) ? hit[0] : PCONSTANT;

            double score = lexicon + tbigram - tunigram;

            /* remaining morphemes: `tag` always holds the tag of the previous morpheme on entry */
            for (int m = 1; m < eojeol.length; m++)
            {
                System.String nextTag = eojeol.getTag(m);

                /* pair entry for previous tag -> this tag */
                hit     = ptt_pos_tf.get_Renamed(tag + "-" + nextTag);
                tbigram = (hit != null) ? hit[0] : PCONSTANT;

                /* morpheme/tag entry */
                hit     = pwt_pos_tf.get_Renamed(eojeol.getMorpheme(m) + "/" + nextTag);
                lexicon = (hit != null) ? hit[0] : PCONSTANT;

                /* single-tag entry */
                hit      = ptt_pos_tf.get_Renamed(nextTag);
                tunigram = (hit != null) ? hit[0] : PCONSTANT;

                score += lexicon + tbigram - tunigram;

                tag = nextTag;
            }

            /* trailing blank: pair entry for last tag -> blank */
            hit     = ptt_pos_tf.get_Renamed(tag + "-bnk");
            tbigram = (hit != null) ? hit[0] : PCONSTANT;

            /* trailing blank: single-tag entry for the blank itself */
            hit      = ptt_pos_tf.get_Renamed("bnk");
            tunigram = (hit != null) ? hit[0] : PCONSTANT;

            /* the blank has no morpheme term: the original notes P(w|bnk) = 1 and ln(1) = 0 */
            score += 0 + tbigram - tunigram;

            return score;
        }
예제 #14
0
        /// <summary>
        /// Computes the score of one eojeol from three tables: tag-pair entries, single-tag entries and
        /// morpheme/tag entries, accumulated over all of its morphemes.
        /// (The original documentation describes the result as P(T_i, W_i).)
        /// </summary>
        /// <param name="eojeol">eojeol to score; its tag at index 0 is read unconditionally</param>
        /// <returns>the accumulated score</returns>
        /// <remarks>
        /// Every step contributes <c>lexicon + tbigram - tunigram</c>; the original comments read this as
        /// P(w|t) * P(t|t_prev) / P(t), which suggests the tables hold logarithms. A lookup that finds
        /// no entry falls back to <c>PCONSTANT</c>. Tag pairs are looked up under "left-right" keys, with
        /// "bnk" standing for the blank on either side of the eojeol.
        /// </remarks>
        private double compute_wt(Eojeol eojeol)
        {
            double tbigram, tunigram, lexicon;
            double[] found;

            System.String tag = eojeol.getTag(0);

            /* opening step: transition from the leading blank into the first tag */
            found   = ptt_pos_tf.get_Renamed("bnk-" + tag);
            tbigram = (found == null) ? PCONSTANT : found[0];

            /* opening step: the first tag on its own */
            found    = ptt_pos_tf.get_Renamed(tag);
            tunigram = (found == null) ? PCONSTANT : found[0];

            /* opening step: first morpheme paired with its tag */
            found   = pwt_pos_tf.get_Renamed(eojeol.getMorpheme(0) + "/" + tag);
            lexicon = (found == null) ? PCONSTANT : found[0];

            double total = lexicon + tbigram - tunigram;

            /* inner steps: on entry `tag` is the tag of the morpheme before position `pos` */
            for (int pos = 1; pos < eojeol.length; pos++)
            {
                System.String currentTag = eojeol.getTag(pos);

                /* transition previous tag -> current tag */
                found   = ptt_pos_tf.get_Renamed(tag + "-" + currentTag);
                tbigram = (found == null) ? PCONSTANT : found[0];

                /* morpheme paired with its tag */
                found   = pwt_pos_tf.get_Renamed(eojeol.getMorpheme(pos) + "/" + currentTag);
                lexicon = (found == null) ? PCONSTANT : found[0];

                /* current tag on its own */
                found    = ptt_pos_tf.get_Renamed(currentTag);
                tunigram = (found == null) ? PCONSTANT : found[0];

                total += lexicon + tbigram - tunigram;

                tag = currentTag;
            }

            /* closing step: transition from the last tag into the trailing blank */
            found   = ptt_pos_tf.get_Renamed(tag + "-bnk");
            tbigram = (found == null) ? PCONSTANT : found[0];

            /* closing step: the blank on its own */
            found    = ptt_pos_tf.get_Renamed("bnk");
            tunigram = (found == null) ? PCONSTANT : found[0];

            /* no morpheme term for the blank: the original notes P(w|bnk) = 1 and ln(1) = 0 */
            total += 0 + tbigram - tunigram;

            return(total);
        }
예제 #15
0
        /// <summary>
        /// Post-processes the morphological analysis: for every morpheme whose tag starts with "e",
        /// the morpheme may be rewritten depending on how the preceding morpheme ends.
        /// </summary>
        /// <param name="sos">the result of morphological analysis; the arrays returned by each eojeol's <c>Morphemes</c> property are modified in place</param>
        /// <returns>the same <paramref name="sos"/> instance</returns>
        /// <remarks>
        /// Two rewrites are applied, both working on the strings produced by <c>Code.toTripleString</c>:
        /// a morpheme starting with <c>A_</c> may be replaced by <c>AR</c> (the original comment: 'eo' -> 'a', 어 -> 아),
        /// and a morpheme starting with a prefix of <c>EU</c>, <c>SU</c> or <c>NU</c> may lose its first two characters
        /// (the original comment: elision of 'eu', 'seu', 'neu').
        /// Prefix tests use ordinal comparison: the single-argument <c>String.StartsWith</c> is culture-sensitive
        /// in .NET, which is not what a code-unit prefix test on these strings needs.
        /// </remarks>
        public virtual SetOfSentences doPostProcessing(SetOfSentences sos)
        {
            List <Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();

            foreach (Eojeol[] eojeolSet in eojeolSetArray)
            {
                // NOTE(review): prevMorph is reset once per eojeol set, not per eojeol, so the first
                // morpheme of one eojeol sees the last morpheme of the previous one -- confirm this is intended.
                System.String prevMorph = "";

                for (int i = 0; i < eojeolSet.Length; i++)
                {
                    Eojeol          eojeol    = eojeolSet[i];
                    System.String[] morphemes = eojeol.Morphemes;
                    System.String[] tags      = eojeol.Tags;

                    for (int j = 0; j < eojeol.length; j++)
                    {
                        System.String tri = Code.toTripleString(morphemes[j]);
                        if (tags[j].StartsWith("e", System.StringComparison.Ordinal))
                        {
                            int prevLen = prevMorph.Length;

                            if (tri.StartsWith(A_, System.StringComparison.Ordinal))
                            {
                                /* 'eo' -> 'a' (어 -> 아) */
                                if (prevLen >= 4
                                    && prevMorph[prevLen - 1] == EU[1]
                                    && !isXEU(prevMorph[prevLen - 2])
                                    && ((Code.isJungseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 3]))
                                        || (Code.isJongseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 4]))))
                                {
                                    morphemes[j] = Code.toString(AR.ToCharArray());
                                }
                                else if (prevLen >= 3
                                         && prevMorph[prevLen - 1] == DOB[2]
                                         && !prevMorph.Substring(prevLen - 3).Equals(DOB)
                                         && !prevMorph.Substring(prevLen - 3).Equals(GOB))
                                {
                                    /* 'b' (ㅂ) irregular: leave the morpheme unchanged.
                                     * The previous condition joined the two exclusions with ||, which is
                                     * true for every string whenever DOB and GOB differ, so a stem ending in
                                     * DOB or GOB was caught here too and never reached the rewrite below. */
                                }
                                else if (prevLen >= 2 && prevMorph.Substring(prevLen - 2).Equals(HA))
                                {
                                    /* previous morpheme ends with HA: leave the morpheme unchanged */
                                }
                                else if (prevLen >= 2
                                         && ((Code.isJungseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 1]))
                                             || (Code.isJongseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 2]))))
                                {
                                    // the previous morpheme ends in a vowel accepted by isPV, with or without a final consonant after it
                                    morphemes[j] = Code.toString(AR.ToCharArray());
                                }
                            }
                            else if (tri.StartsWith(EU.Substring(0, 2), System.StringComparison.Ordinal)
                                     || tri.StartsWith(SU.Substring(0, 4), System.StringComparison.Ordinal)
                                     || tri.StartsWith(NU.Substring(0, 4), System.StringComparison.Ordinal))
                            {
                                /* elision of 'eu', 'seu', 'neu' (으, 스, 느): applies when the previous morpheme
                                 * ends in a jungseong or in U+11AF (HANGUL JONGSEONG RIEUL) */
                                if (prevLen >= 2 && (Code.isJungseong(prevMorph[prevLen - 1]) || prevMorph[prevLen - 1] == 0x11AF))
                                {
                                    morphemes[j] = Code.toString(tri.Substring(2).ToCharArray());
                                }
                            }
                        }

                        // Recomputed from morphemes[j] because the slot may just have been rewritten.
                        prevMorph = Code.toTripleString(morphemes[j]);
                    }
                }
            }

            return(sos);
        }