예제 #1
0
        /// <summary>
        /// Closes the sentence with an "SF" node, runs the Viterbi scoring pass over every pair of
        /// adjacent candidates, and collects the eojeols reached by following the back pointers.
        /// </summary>
        /// <param name="sos">all the candidates of morphological analysis</param>
        /// <returns>the final morphological analysis result which has the highest probability</returns>
        private Sentence end_sentence(SetOfSentences sos)
        {
            /* Creates the last node */
            int lastPhrase = new_wp(" ");
            wp[lastPhrase].MNode = new_mnode(null, "SF", 0);

            /* Runs viterbi: every candidate of a phrase is scored against every candidate of the next one */
            int phrase = 1;
            while (phrase < wp_end - 1)
            {
                int current = wp[phrase].MNode;
                while (current != 0)    // index 0 ends a sibling chain
                {
                    int following = wp[phrase + 1].MNode;
                    while (following != 0)
                    {
                        update_prob_score(current, following);
                        following = mn[following].Sibling;
                    }
                    current = mn[current].Sibling;
                }
                phrase++;
            }

            /* Fills the result array from its end while walking the back pointers */
            int slot = sos.length;
            Eojeol[] eojeols = new Eojeol[slot];
            int node = wp[slot].MNode;
            while (node != 0)
            {
                eojeols[--slot] = mn[node].Eojeol;
                node = mn[node].Backptr;
            }

            return new Sentence(sos.DocumentID, sos.SentenceID, sos.EndOfDocument, sos.getPlainEojeolArray().ToArray(), eojeols);
        }
예제 #2
0
        /// <summary>
        /// Builds one word phrase per eojeol and one node per candidate analysis, chaining the
        /// candidates of a phrase through their Sibling links, then hands over to end_sentence.
        /// </summary>
        /// <param name="sos">the plain eojeols of a sentence and, for each, its candidate analyses</param>
        /// <returns>the sentence produced by end_sentence</returns>
        public virtual Sentence tagPOS(SetOfSentences sos)
        {
            List<string> plainEojeols = sos.getPlainEojeolArray();
            List<Eojeol[]> candidateSets = sos.getEojeolSetArray();

            // initialization
            reset();

            // Walk both lists in lock step; stop as soon as either one is exhausted.
            for (int idx = 0; idx < candidateSets.Count && idx < plainEojeols.Count; idx++)
            {
                Eojeol[] candidates = candidateSets[idx];
                int phrase = new_wp(plainEojeols[idx]);
                int previousNode = 0;

                for (int c = 0; c < candidates.Length; c++)
                {
                    string phraseTag = PhraseTag.getPhraseTag(candidates[c].Tags);
                    double weight = compute_wt(candidates[c]);
                    int node = new_mnode(candidates[c], phraseTag, weight);

                    if (c == 0)
                    {
                        // the first candidate becomes the head of the phrase's node list
                        wp[phrase].MNode = node;
                    }
                    else
                    {
                        // later candidates are appended behind the previous one
                        mn[previousNode].Sibling = node;
                    }
                    previousNode = node;
                }
            }

            // gets the final result by running viterbi
            return end_sentence(sos);
        }
예제 #3
0
        /// <summary>
        /// Expands analyses that contain the tag "unk". For every "unk" tag found, a copy of the
        /// analysis with that tag set to "nqq" is appended to the candidate set, and the tag in the
        /// scanned analysis itself is rewritten to "ncn".
        /// </summary>
        /// <param name="sos">the candidate analyses; its eojeol sets and tag arrays are modified in place</param>
        /// <returns>the same <paramref name="sos"/> instance</returns>
        public virtual SetOfSentences doProcess(SetOfSentences sos)
        {
            List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();

            // Work list. A List gives O(1) indexed access; the scan below indexes it on every step.
            List<Eojeol> expanded = new List<Eojeol>();

            for (int i = 0; i < eojeolSetArray.Count; i++)
            {
                Eojeol[] eojeolSet = eojeolSetArray[i];

                expanded.Clear();
                expanded.AddRange(eojeolSet);

                int unkCount = 0;

                // expanded.Count grows while scanning, so the appended copies are scanned as well;
                // that is how analyses with several "unk" tags get every nqq/ncn combination.
                for (int j = 0; j < expanded.Count; j++)
                {
                    Eojeol eojeol = expanded[j];
                    string[] tags = eojeol.Tags;
                    string[] morphemes = eojeol.Morphemes;

                    for (int k = 0; k < tags.Length; k++)
                    {
                        if (tags[k] == "unk")
                        {
                            // snapshot the analysis with this tag read as "nqq" ...
                            tags[k] = "nqq";
                            expanded.Add(new Eojeol((string[])morphemes.Clone(), (string[])tags.Clone()));

                            // ... and leave "ncn" in the analysis being scanned
                            tags[k] = "ncn";
                            unkCount++;
                        }
                    }
                }

                // Only replace the stored set when something was actually added.
                if (unkCount > 0)
                {
                    eojeolSetArray[i] = expanded.ToArray();
                }
            }

            return sos;
        }
예제 #4
0
        /// <summary>
        /// Expands analyses that contain the tag "unk". For every "unk" tag found, a copy of the
        /// analysis with that tag set to "nqq" is appended to the candidate set, and the tag in the
        /// scanned analysis itself is rewritten to "ncn".
        /// </summary>
        /// <param name="sos">the candidate analyses; its eojeol sets and tag arrays are modified in place</param>
        /// <returns>the same <paramref name="sos"/> instance</returns>
        public virtual SetOfSentences doProcess(SetOfSentences sos)
        {
            List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();

            // Work list. A List gives O(1) indexed access; the scan below indexes it on every step.
            List<Eojeol> workList = new List<Eojeol>();

            for (int i = 0; i < eojeolSetArray.Count; i++)
            {
                Eojeol[] eojeolSet = eojeolSetArray[i];

                workList.Clear();
                workList.AddRange(eojeolSet);

                int unkCount = 0;

                // workList.Count grows while scanning, so the appended copies are scanned as well;
                // that is how analyses with several "unk" tags get every nqq/ncn combination.
                for (int j = 0; j < workList.Count; j++)
                {
                    Eojeol eojeol = workList[j];
                    string[] tags = eojeol.Tags;
                    string[] morphemes = eojeol.Morphemes;

                    for (int k = 0; k < tags.Length; k++)
                    {
                        if (tags[k] == "unk")
                        {
                            // snapshot the analysis with this tag read as "nqq" ...
                            tags[k] = "nqq";
                            workList.Add(new Eojeol((string[])morphemes.Clone(), (string[])tags.Clone()));

                            // ... and leave "ncn" in the analysis being scanned
                            tags[k] = "ncn";
                            unkCount++;
                        }
                    }
                }

                // Only replace the stored set when something was actually added.
                if (unkCount > 0)
                {
                    eojeolSetArray[i] = workList.ToArray();
                }
            }

            return sos;
        }
예제 #5
0
        /// <summary>
        /// Worker loop: takes a set of sentences from the input queue, passes it through
        /// morphProcessor, and forwards every non-null result to the output queue.
        /// The loop only ends when a ThreadInterruptedException is raised, at which point
        /// the processor is shut down.
        /// </summary>
        override public void Run()
        {
            try
            {
                for (;;)
                {
                    SetOfSentences incoming = in_Renamed.Take();
                    SetOfSentences processed = morphProcessor.doProcess(incoming);

                    if (processed == null)
                    {
                        // nothing to forward for this input
                        continue;
                    }
                    out_Renamed.Add(processed);
                }
            }
            catch (System.Threading.ThreadInterruptedException)
            {
                morphProcessor.shutdown();
            }
        }
예제 #6
0
        /// <summary>
        /// Worker loop: takes a set of sentences from the input queue, tags it with the
        /// POS tagger, and forwards every non-null sentence to the output queue.
        /// The loop only ends when a ThreadInterruptedException is raised, at which point
        /// the tagger is shut down.
        /// </summary>
        override public void Run()
        {
            try
            {
                for (;;)
                {
                    SetOfSentences candidates = in_Renamed.Take();
                    Sentence tagged = tagger.tagPOS(candidates);

                    if (tagged == null)
                    {
                        // nothing to forward for this input
                        continue;
                    }
                    out_Renamed.Add(tagged);
                }
            }
            catch (System.Threading.ThreadInterruptedException)
            {
                tagger.shutdown();
            }
        }
예제 #7
0
        /// <summary>
        /// Worker loop: takes a plain sentence from the input queue, runs morphological
        /// analysis on it, and forwards every non-null result to the output queue.
        /// The loop only ends when a ThreadInterruptedException is raised, at which point
        /// the analyzer is shut down.
        /// </summary>
        override public void Run()
        {
            try
            {
                for (;;)
                {
                    PlainSentence sentence = in_Renamed.Take();
                    SetOfSentences analyzed = ma.morphAnalyze(sentence);

                    if (analyzed == null)
                    {
                        // nothing to forward for this input
                        continue;
                    }
                    out_Renamed.Add(analyzed);
                }
            }
            catch (System.Threading.ThreadInterruptedException)
            {
                ma.shutdown();
            }
        }
예제 #8
0
        /// <summary>
        /// Splits the plain sentence on spaces and tabs, analyzes every resulting eojeol with
        /// processEojeol, and returns the collected analyses after post processing.
        /// </summary>
        /// <param name="ps">the plain sentence to analyze</param>
        /// <returns>all the possible morphological analysis results</returns>
        public virtual SetOfSentences morphAnalyze(PlainSentence ps)
        {
            StringTokenizer tokenizer = new StringTokenizer(ps.Sentence, " \t");

            // Pre-size both lists with the tokenizer's count.
            int capacity = tokenizer.Count;
            List<string> words = new List<string>(capacity);
            List<Eojeol[]> analyses = new List<Eojeol[]>(capacity);

            while (tokenizer.HasMoreTokens)
            {
                string word = tokenizer.NextToken;

                words.Add(word);
                analyses.Add(processEojeol(word));
            }

            SetOfSentences collected = new SetOfSentences(ps.DocumentID, ps.SentenceID, ps.EndOfDocument, words, analyses);

            return postProc.doPostProcessing(collected);
        }
예제 #9
0
        /// <summary>
        /// Maps every tag to its TAG_LEVEL form via TagMapper, merges neighbouring morphemes whose
        /// mapped tags are equal, and drops analyses of an eojeol that have the same string form.
        /// (Per the original note: 69 KAIST tags are reduced to a simplified set of 22 tags.)
        /// </summary>
        /// <param name="sos">the result of morphological analysis where each eojeol has more than one analysis result</param>
        /// <returns>the same <paramref name="sos"/>, now holding the simplified analysis result</returns>
        public virtual SetOfSentences doProcess(SetOfSentences sos)
        {
            List<Eojeol[]> sourceSets = sos.getEojeolSetArray();
            List<Eojeol[]> simplifiedSets = new List<Eojeol[]>();

            int setCount = sourceSets.Count;

            for (int pos = 0; pos < setCount; pos++)
            {
                Eojeol[] candidates = sourceSets[pos];
                dupFilterMap.Clear();

                for (int i = 0; i < candidates.Length; i++)
                {
                    string[] tags = candidates[i].Tags;

                    // Map the tags in place and remember whether two neighbours ended up equal.
                    string previousTag = "";
                    bool needsMerge = false;
                    for (int j = 0; j < tags.Length; j++)
                    {
                        tags[j] = TagMapper.getKaistTagOnLevel(tags[j], TAG_LEVEL);
                        needsMerge = tags[j].Equals(previousTag) || needsMerge;
                        previousTag = tags[j];
                    }

                    if (needsMerge)
                    {
                        tagList.Clear();
                        morphemeList.Clear();
                        string[] morphemes = candidates[i].Morphemes;

                        int lastIndex = tags.Length - 1;
                        for (int j = 0; j < lastIndex; j++)
                        {
                            if (!tags[j].Equals(tags[j + 1]))
                            {
                                // a run of equal tags ends here: emit it
                                tagList.Add(tags[j]);
                                morphemeList.Add(morphemes[j]);
                                continue;
                            }

                            // same tag as the next morpheme: carry the text forward
                            morphemes[j + 1] = morphemes[j] + morphemes[j + 1];
                        }
                        tagList.Add(tags[lastIndex]);
                        morphemeList.Add(morphemes[morphemes.Length - 1]);

                        candidates[i] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
                    }

                    // Keep only the first analysis for each string form.
                    string key = candidates[i].ToString();
                    if (!dupFilterMap.ContainsKey(key))
                    {
                        dupFilterMap[key] = candidates[i];
                    }
                }

                if (candidates.Length == dupFilterMap.Count)
                {
                    // no duplicates: keep the (possibly updated) original array
                    simplifiedSets.Add(candidates);
                }
                else
                {
                    simplifiedSets.Add(dupFilterMap.Values.ToArray());
                }
            }

            sos.setEojeolSetArray(simplifiedSets);
            return sos;
        }
예제 #10
0
        /// <summary>
        /// Maps every tag to its TAG_LEVEL form via TagMapper, merges neighbouring morphemes whose
        /// mapped tags are equal, and drops analyses of an eojeol that have the same string form.
        /// (Per the original note: 69 KAIST tags are reduced to a simplified set of 22 tags.)
        /// </summary>
        /// <param name="sos">the result of morphological analysis where each eojeol has more than one analysis result</param>
        /// <returns>the same <paramref name="sos"/>, now holding the simplified analysis result</returns>
        public virtual SetOfSentences doProcess(SetOfSentences sos)
        {
            List<Eojeol[]> inputSets = sos.getEojeolSetArray();
            List<Eojeol[]> outputSets = new List<Eojeol[]>();

            int total = inputSets.Count;

            for (int pos = 0; pos < total; pos++)
            {
                Eojeol[] analyses = inputSets[pos];
                dupFilterMap.Clear();

                for (int i = 0; i < analyses.Length; i++)
                {
                    string[] tags = analyses[i].Tags;

                    // Map the tags in place and remember whether two neighbours ended up equal.
                    string lastSeen = "";
                    bool hasRun = false;
                    for (int j = 0; j < tags.Length; j++)
                    {
                        tags[j] = TagMapper.getKaistTagOnLevel(tags[j], TAG_LEVEL);
                        hasRun = tags[j].Equals(lastSeen) || hasRun;
                        lastSeen = tags[j];
                    }

                    if (hasRun)
                    {
                        tagList.Clear();
                        morphemeList.Clear();
                        string[] morphemes = analyses[i].Morphemes;

                        int tail = tags.Length - 1;
                        for (int j = 0; j < tail; j++)
                        {
                            if (!tags[j].Equals(tags[j + 1]))
                            {
                                // a run of equal tags ends here: emit it
                                tagList.Add(tags[j]);
                                morphemeList.Add(morphemes[j]);
                                continue;
                            }

                            // same tag as the next morpheme: carry the text forward
                            morphemes[j + 1] = morphemes[j] + morphemes[j + 1];
                        }
                        tagList.Add(tags[tail]);
                        morphemeList.Add(morphemes[morphemes.Length - 1]);

                        analyses[i] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
                    }

                    // Keep only the first analysis for each string form.
                    string key = analyses[i].ToString();
                    if (!dupFilterMap.ContainsKey(key))
                    {
                        dupFilterMap[key] = analyses[i];
                    }
                }

                if (analyses.Length == dupFilterMap.Count)
                {
                    // no duplicates: keep the (possibly updated) original array
                    outputSets.Add(analyses);
                }
                else
                {
                    outputSets.Add(dupFilterMap.Values.ToArray());
                }
            }

            sos.setEojeolSetArray(outputSets);
            return sos;
        }
예제 #11
0
        /// <summary>
        /// Splits the plain sentence on spaces and tabs, analyzes every resulting eojeol with
        /// processEojeol, and returns the collected analyses after post processing.
        /// </summary>
        /// <param name="ps">the plain sentence to analyze</param>
        /// <returns>all the possible morphological analysis results</returns>
        public virtual SetOfSentences morphAnalyze(PlainSentence ps)
        {
            StringTokenizer tokens = new StringTokenizer(ps.Sentence, " \t");

            // Pre-size both lists with the tokenizer's count.
            int expected = tokens.Count;
            List<string> plainEojeols = new List<string>(expected);
            List<Eojeol[]> eojeolSets = new List<Eojeol[]>(expected);

            while (tokens.HasMoreTokens)
            {
                string token = tokens.NextToken;

                plainEojeols.Add(token);
                eojeolSets.Add(processEojeol(token));
            }

            SetOfSentences raw = new SetOfSentences(ps.DocumentID, ps.SentenceID, ps.EndOfDocument, plainEojeols, eojeolSets);

            return postProc.doPostProcessing(raw);
        }
예제 #12
0
        /// <summary>
        /// Closes the sentence with an "SF" node, runs the Viterbi scoring pass over every pair of
        /// adjacent candidates, and collects the eojeols reached by following the back pointers.
        /// </summary>
        /// <param name="sos">all the candidates of morphological analysis</param>
        /// <returns>the final morphological analysis result which has the highest probability</returns>
        private Sentence end_sentence(SetOfSentences sos)
        {
            /* Creates the last node */
            int closing = new_wp(" ");
            wp[closing].MNode = new_mnode(null, "SF", 0);

            /* Runs viterbi: every candidate of a phrase is scored against every candidate of the next one */
            int w = 1;
            while (w < wp_end - 1)
            {
                int left = wp[w].MNode;
                while (left != 0)    // index 0 ends a sibling chain
                {
                    int right = wp[w + 1].MNode;
                    while (right != 0)
                    {
                        update_prob_score(left, right);
                        right = mn[right].Sibling;
                    }
                    left = mn[left].Sibling;
                }
                w++;
            }

            /* Fills the result array from its end while walking the back pointers */
            int remaining = sos.length;
            Eojeol[] eojeols = new Eojeol[remaining];
            int cursor = wp[remaining].MNode;
            while (cursor != 0)
            {
                eojeols[--remaining] = mn[cursor].Eojeol;
                cursor = mn[cursor].Backptr;
            }

            return new Sentence(sos.DocumentID, sos.SentenceID, sos.EndOfDocument, sos.getPlainEojeolArray().ToArray(), eojeols);
        }
예제 #13
0
        /// <summary>
        /// Builds one word phrase per eojeol and one node per candidate analysis, chaining the
        /// candidates of a phrase through their Sibling links, then hands over to end_sentence.
        /// </summary>
        /// <param name="sos">the plain eojeols of a sentence and, for each, its candidate analyses</param>
        /// <returns>the sentence produced by end_sentence</returns>
        public virtual Sentence tagPOS(SetOfSentences sos)
        {
            List<string> words = sos.getPlainEojeolArray();
            List<Eojeol[]> analysesPerWord = sos.getEojeolSetArray();

            // initialization
            reset();

            // Walk both lists in lock step; stop as soon as either one is exhausted.
            for (int n = 0; n < analysesPerWord.Count && n < words.Count; n++)
            {
                Eojeol[] analyses = analysesPerWord[n];
                int wordPhrase = new_wp(words[n]);
                int tailNode = 0;

                for (int a = 0; a < analyses.Length; a++)
                {
                    string tag = PhraseTag.getPhraseTag(analyses[a].Tags);
                    double score = compute_wt(analyses[a]);
                    int created = new_mnode(analyses[a], tag, score);

                    if (a == 0)
                    {
                        // the first candidate becomes the head of the phrase's node list
                        wp[wordPhrase].MNode = created;
                    }
                    else
                    {
                        // later candidates are appended behind the previous one
                        mn[tailNode].Sibling = created;
                    }
                    tailNode = created;
                }
            }

            // gets the final result by running viterbi
            return end_sentence(sos);
        }
예제 #14
0
        /// <summary> It does post processing of morphological analysis to deal with some exceptions.</summary>
        /// <remarks>
        /// Only morphemes whose tag starts with "e" are touched. Depending on the end of the
        /// preceding morpheme (in triple/jamo form) the ending is either replaced by AR or has its
        /// first two jamo removed. Morpheme arrays are modified in place.
        /// All prefix tests are ordinal so the result does not depend on the current culture.
        /// </remarks>
        /// <param name="sos">- the result of morphological analysis
        /// </param>
        /// <returns> the result of morphological analysis with post processing (the same instance)
        /// </returns>
        public virtual SetOfSentences doPostProcessing(SetOfSentences sos)
        {
            foreach (Eojeol[] eojeolSet in sos.getEojeolSetArray())
            {
                // NOTE(review): prevMorph is reset per eojeol set, not per candidate analysis, so the
                // first morpheme of candidate i+1 sees the last morpheme of candidate i — confirm intended.
                string prevMorph = "";

                for (int i = 0; i < eojeolSet.Length; i++)
                {
                    Eojeol eojeol = eojeolSet[i];
                    string[] morphemes = eojeol.Morphemes;
                    string[] tags = eojeol.Tags;

                    for (int j = 0; j < eojeol.length; j++)
                    {
                        string tri = Code.toTripleString(morphemes[j]);
                        if (tags[j].StartsWith("e", System.StringComparison.Ordinal))
                        {
                            int prevLen = prevMorph.Length;

                            if (tri.StartsWith(A_, System.StringComparison.Ordinal))
                            {
                                /* 어 -> 아 */
                                if (prevLen >= 4 && prevMorph[prevLen - 1] == EU[1] && !isXEU(prevMorph[prevLen - 2]) && ((Code.isJungseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 3])) || (Code.isJongseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 4]))))
                                {
                                    morphemes[j] = Code.toString(AR.ToCharArray());
                                }
                                else if (prevLen >= 3 && prevMorph[prevLen - 1] == DOB[2] && !prevMorph.Substring(prevLen - 3).Equals(DOB) && !prevMorph.Substring(prevLen - 3).Equals(GOB))
                                {
                                    /* for 'ㅂ' irregular: the ending is left unchanged, except after
                                     * DOB and GOB, which fall through to the checks below.
                                     * (The two exclusions must be combined with AND; with OR the
                                     * condition holds for every stem and the exception never applies.) */
                                }
                                else if (prevLen >= 2 && prevMorph.Substring(prevLen - 2).Equals(HA))
                                {
                                    /* after HA the ending is left unchanged */
                                }
                                else if (prevLen >= 2 && ((Code.isJungseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 1])) || (Code.isJongseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 2]))))
                                {
                                    // final consonant or not
                                    morphemes[j] = Code.toString(AR.ToCharArray());
                                }
                            }
                            else if (tri.StartsWith(EU.Substring(0, 2), System.StringComparison.Ordinal) || tri.StartsWith(SU.Substring(0, 4), System.StringComparison.Ordinal) || tri.StartsWith(NU.Substring(0, 4), System.StringComparison.Ordinal))
                            {
                                /* elision of '으', '스', '느' */
                                // 0x11AF is U+11AF, HANGUL JONGSEONG RIEUL
                                if (prevLen >= 2 && (Code.isJungseong(prevMorph[prevLen - 1]) || prevMorph[prevLen - 1] == 0x11AF))
                                {
                                    morphemes[j] = Code.toString(tri.Substring(2).ToCharArray());
                                }
                            }
                        }

                        // the (possibly rewritten) morpheme is the context for the next one
                        prevMorph = Code.toTripleString(morphemes[j]);
                    }
                }
            }

            return sos;
        }
예제 #15
0
        /// <summary> It does post processing of morphological analysis to deal with some exceptions.</summary>
        /// <remarks>
        /// Only morphemes whose tag starts with "e" are touched. Depending on the end of the
        /// preceding morpheme (in triple/jamo form) the ending is either replaced by AR or has its
        /// first two jamo removed. Morpheme arrays are modified in place.
        /// All prefix tests are ordinal so the result does not depend on the current culture.
        /// </remarks>
        /// <param name="sos">- the result of morphological analysis
        /// </param>
        /// <returns> the result of morphological analysis with post processing (the same instance)
        /// </returns>
        public virtual SetOfSentences doPostProcessing(SetOfSentences sos)
        {
            foreach (Eojeol[] eojeolSet in sos.getEojeolSetArray())
            {
                // NOTE(review): prevMorph is reset per eojeol set, not per candidate analysis, so the
                // first morpheme of candidate i+1 sees the last morpheme of candidate i — confirm intended.
                string prevMorph = "";

                for (int i = 0; i < eojeolSet.Length; i++)
                {
                    Eojeol eojeol = eojeolSet[i];
                    string[] morphemes = eojeol.Morphemes;
                    string[] tags = eojeol.Tags;

                    for (int j = 0; j < eojeol.length; j++)
                    {
                        string tri = Code.toTripleString(morphemes[j]);
                        if (tags[j].StartsWith("e", System.StringComparison.Ordinal))
                        {
                            int prevLen = prevMorph.Length;

                            if (tri.StartsWith(A_, System.StringComparison.Ordinal))
                            {
                                /* 어 -> 아 */
                                if (prevLen >= 4 && prevMorph[prevLen - 1] == EU[1] && !isXEU(prevMorph[prevLen - 2]) && ((Code.isJungseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 3])) || (Code.isJongseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 4]))))
                                {
                                    morphemes[j] = Code.toString(AR.ToCharArray());
                                }
                                else if (prevLen >= 3 && prevMorph[prevLen - 1] == DOB[2] && !prevMorph.Substring(prevLen - 3).Equals(DOB) && !prevMorph.Substring(prevLen - 3).Equals(GOB))
                                {
                                    /* for 'ㅂ' irregular: the ending is left unchanged, except after
                                     * DOB and GOB, which fall through to the checks below.
                                     * (The two exclusions must be combined with AND; with OR the
                                     * condition holds for every stem and the exception never applies.) */
                                }
                                else if (prevLen >= 2 && prevMorph.Substring(prevLen - 2).Equals(HA))
                                {
                                    /* after HA the ending is left unchanged */
                                }
                                else if (prevLen >= 2 && ((Code.isJungseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 1])) || (Code.isJongseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 2]))))
                                {
                                    // final consonant or not
                                    morphemes[j] = Code.toString(AR.ToCharArray());
                                }
                            }
                            else if (tri.StartsWith(EU.Substring(0, 2), System.StringComparison.Ordinal) || tri.StartsWith(SU.Substring(0, 4), System.StringComparison.Ordinal) || tri.StartsWith(NU.Substring(0, 4), System.StringComparison.Ordinal))
                            {
                                /* elision of '으', '스', '느' */
                                // 0x11AF is U+11AF, HANGUL JONGSEONG RIEUL
                                if (prevLen >= 2 && (Code.isJungseong(prevMorph[prevLen - 1]) || prevMorph[prevLen - 1] == 0x11AF))
                                {
                                    morphemes[j] = Code.toString(tri.Substring(2).ToCharArray());
                                }
                            }
                        }

                        // the (possibly rewritten) morpheme is the context for the next one
                        prevMorph = Code.toTripleString(morphemes[j]);
                    }
                }
            }

            return sos;
        }