Example #1
0
        /// <summary>
        /// Reads the morpheme dictionary file and stores every (word, tag) entry in the trie.
        /// </summary>
        /// <remarks>
        /// Each non-blank line is tokenized with tab and space as delimiters: the first token is
        /// the word, and every following token is a tag name optionally followed by "." and an
        /// irregular-type name. Tokens whose tag is unknown to <paramref name="tagSet"/> are
        /// reported on the standard error stream and skipped.
        /// </remarks>
        /// <param name="dictionaryFileName">the file path of the morpheme dictionary (read as UTF-8)</param>
        /// <param name="tagSet">the morpheme tag set used to resolve tag and irregular-type IDs</param>
        /// <exception cref="System.IO.IOException">if the dictionary file cannot be opened or read</exception>
        public virtual void  read_dic(System.String dictionaryFileName, TagSet tagSet)
        {
            // The using statement guarantees the reader (and the underlying file stream)
            // is closed even when parsing or storing throws.
            using (System.IO.StreamReader in_Renamed = new System.IO.StreamReader(
                       new System.IO.FileStream(dictionaryFileName, System.IO.FileMode.Open, System.IO.FileAccess.Read),
                       System.Text.Encoding.UTF8))
            {
                // Scratch buffer that is refilled for every dictionary line.
                INFO[] info_list = new INFO[255];
                for (int i = 0; i < 255; i++)
                {
                    info_list[i] = new INFO(this);
                }

                System.String str;
                while ((str = in_Renamed.ReadLine()) != null)
                {
                    // Strings are immutable: the trimmed value must be assigned back,
                    // otherwise whitespace-only lines are not recognized as blank.
                    str = str.Trim();
                    if (str.Length == 0)
                    {
                        continue;
                    }

                    StringTokenizer tok   = new StringTokenizer(str, "\t ");
                    System.String   word  = tok.NextToken;
                    int             isize = 0;

                    while (tok.HasMoreTokens)
                    {
                        // A tag token has the form "tag" or "tag.irregularType".
                        System.String   data = tok.NextToken;
                        StringTokenizer tok2 = new StringTokenizer(data, ".");
                        System.String   curt = tok2.NextToken;
                        int             x    = tagSet.getTagID(curt);
                        if (x == -1)
                        {
                            System.Console.Error.WriteLine("read_dic:tag error");
                            continue;
                        }

                        if (tok2.HasMoreTokens)
                        {
                            info_list[isize].phoneme = (short)tagSet.getIrregularID(tok2.NextToken);
                        }
                        else
                        {
                            info_list[isize].phoneme = TagSet.PHONEME_TYPE_ALL;
                        }

                        info_list[isize].tag = x;
                        isize++;
                    }

                    // Reset the slot following the last entry that was filled in.
                    info_list[isize].tag     = 0;
                    info_list[isize].phoneme = 0;

                    char[] word3 = Code.toTripleArray(word);
                    for (int i = 0; i < isize; i++)
                    {
                        store(word3, info_list[i]);
                    }
                }
            }
        }
Example #2
0
 /// <summary>
 /// Dumps the segment position table for debugging. The entry count is written to the
 /// standard error stream; one line per entry is written through <c>Trace</c>.
 /// </summary>
 public virtual void  printPosition()
 {
     const string rowFormat = "position[{0}].key={1} nextPosition={2}";

     System.Console.Error.WriteLine("positionEnd: " + positionEnd);

     int index = 0;
     while (index < positionEnd)
     {
         // Arguments are kept inside the Trace call on purpose.
         Trace.WriteLine(
             string.Format(
                 rowFormat,
                 index,
                 Code.toCompatibilityJamo(position[index].key),
                 position[index].nextPosition));
         index++;
     }
 }
Example #3
0
 /// <summary>
 /// Constructor. Stores the <c>Code.toTripleString</c> conversion of each Korean
 /// literal below in the corresponding member.
 /// </summary>
 public PostProcessor()
 {
     // Single-syllable literals.
     HA = Code.toTripleString("하");
     AR = Code.toTripleString("아");
     A_ = Code.toTripleString("어");
     EU = Code.toTripleString("으");
     DOB = Code.toTripleString("돕");
     GOB = Code.toTripleString("곱");

     // Multi-character sets.
     PV = Code.toTripleString("ㅏㅑㅗ");
     XEU = Code.toTripleString("끄뜨쓰크트");

     // Two-syllable literals.
     SU = Code.toTripleString("습니");
     NU = Code.toTripleString("는다");
 }
Example #4
0
        /// <summary>
        /// Initializes the morpheme chart for the specified word: resets the helper
        /// structures, segments the pre-processed word, and installs the first chart entry.
        /// </summary>
        /// <param name="word">the plain string of an eojeol to analyze</param>
        public virtual void  init(System.String word)
        {
            simti.init();

            System.String replaced = preReplace(word);
            sp.init(Code.toTripleString(replaced), simti);

            // The chart restarts from slot 0, which holds the initial entry.
            chartEnd = 0;
            int first = chartEnd;

            // Register the initial entry with segment position 0.
            Position start = sp.getPosition(0);
            start.morpheme[start.morphCount] = first;
            start.morphCount++;

            chart[first].str             = "";
            chart[first].tag             = tagSet.iwgTag;
            chart[first].phoneme         = 0;
            chart[first].state           = MORPHEME_STATE_SUCCESS;
            chart[first].nextPosition    = 1;
            chart[first].nextTagType     = 0;
            chart[first].connectionCount = 0;

            chartEnd = first + 1;
        }
Example #5
0
 /// <summary>
 /// Recursively prints the trie: one line per node, indented by one tab per depth level,
 /// showing the node index, its key as compatibility jamo, and the tag names of its info list.
 /// </summary>
 /// <param name="pw">the writer that receives the dump</param>
 /// <param name="idx">the index of the trie node to print</param>
 /// <param name="depth">the depth of the current node (number of leading tabs)</param>
 /// <param name="tagSet">the morpheme tag set used to resolve tag names</param>
 public virtual void  print_trie(System.IO.StreamWriter pw, int idx, int depth, TagSet tagSet)
 {
     // Indentation: one tab per level.
     int tabsLeft = depth;
     while (tabsLeft > 0)
     {
         pw.Write("\t");
         tabsLeft--;
     }

     pw.Write(idx + ":" + Code.toCompatibilityJamo(trie_buf[idx].key) + " ");

     // Tags attached to this node, if any.
     var infos = trie_buf[idx].info_list;
     if (infos != null)
     {
         int k = 0;
         while (k < infos.Count)
         {
             pw.Write("t:" + tagSet.getTagName(infos.Get_Renamed(k).tag) + " ");
             k++;
         }
     }
     pw.WriteLine();

     // Children occupy consecutive slots starting at child_idx.
     int offset = 0;
     while (offset < trie_buf[idx].child_size)
     {
         print_trie(pw, trie_buf[idx].child_idx + offset, depth + 1, tagSet);
         offset++;
     }
 }
Example #6
0
        /// <summary>
        /// Dumps every entry of the morpheme chart for debugging. Headers ("chartEnd", "chartID",
        /// "connection=") are written to the standard error stream; the entry details and the
        /// connection list are written through <c>Trace</c>.
        /// </summary>
        public virtual void  printMorphemeAll()
        {
            const string entryFormat = "{0}/{1}.{2} nextPosition={3} nextTagType={4} state={5} ";

            System.Console.Error.WriteLine("chartEnd: " + chartEnd);

            int entry = 0;
            while (entry < chartEnd)
            {
                System.Console.Error.WriteLine("chartID: " + entry);

                // Arguments are kept inside the Trace call on purpose.
                Trace.Write(
                    string.Format(
                        entryFormat,
                        Code.toString(chart[entry].str.ToCharArray()),
                        tagSet.getTagName(chart[entry].tag),
                        tagSet.getIrregularName(chart[entry].phoneme),
                        Code.toCompatibilityJamo(sp.getPosition(chart[entry].nextPosition).key),
                        tagSet.getTagName(chart[entry].nextTagType),
                        chart[entry].state));

                System.Console.Error.Write("connection=");
                int link = 0;
                while (link < chart[entry].connectionCount)
                {
                    Trace.Write(chart[entry].connection[link] + ", ");
                    link++;
                }
                Trace.Write(Environment.NewLine);

                entry++;
            }
        }
Example #7
0
        /// <summary>
        /// Generates the final morphological analysis results by walking the morpheme chart
        /// depth-first from the given entry.
        /// </summary>
        /// <remarks>
        /// Entry 0 only fans out to its connections, clearing the working lists before each one.
        /// For any other entry, the ENG_REPLACE / CHI_REPLACE placeholders in its string are
        /// substituted from the replacement lists, the morpheme and its tag name are pushed onto
        /// the working lists, and each connection is followed. A connection value of 0 emits the
        /// working lists as one <c>Eojeol</c>. On the way back the pushed items are popped and the
        /// replacement indices are rewound.
        /// </remarks>
        /// <param name="chartIndex">the start index of the chart to generate final result</param>
        private void  printChart(int chartIndex)
        {
            Morpheme node = chart[chartIndex];

            if (chartIndex == 0)
            {
                // Each connection of entry 0 starts a fresh candidate.
                for (int c = 0; c < node.connectionCount; c++)
                {
                    resMorphemes.Clear();
                    resTags.Clear();
                    printChart(node.connection[c]);
                }
                return;
            }

            System.String surface = Code.toString(node.str.ToCharArray());
            int           engUsed = 0;
            int           chiUsed = 0;

            // Substitute placeholders one at a time until none is left.
            while (true)
            {
                if (surface.IndexOf(ENG_REPLACE) != -1)
                {
                    engUsed++;
                    surface = surface.ReplaceFirst(ENG_REPLACE, engReplacementList.Get_Renamed(engReplaceIndex++));
                }
                else if (surface.IndexOf(CHI_REPLACE) != -1)
                {
                    chiUsed++;
                    surface = surface.ReplaceFirst(CHI_REPLACE, chiReplacementList.Get_Renamed(chiReplaceIndex++));
                }
                else
                {
                    break;
                }
            }

            resMorphemes.Add(surface);
            resTags.Add(tagSet.getTagName(node.tag));

            for (int c = 0; c < node.connectionCount && printResultCnt < MAX_CANDIDATE_NUM; c++)
            {
                if (node.connection[c] != 0)
                {
                    printChart(node.connection[c]);
                    continue;
                }

                // A connection of 0 closes the candidate: emit a snapshot of the working lists.
                System.String[] morphemeSnapshot = resMorphemes.ToArray();
                System.String[] tagSnapshot      = resTags.ToArray();
                resEojeols.AddLast(new Eojeol(morphemeSnapshot, tagSnapshot));

                printResultCnt++;
            }

            // Undo this entry's contribution before returning to the caller.
            resMorphemes.RemoveAt(resMorphemes.Count - 1);
            resTags.RemoveAt(resTags.Count - 1);
            if (engUsed > 0)
            {
                engReplaceIndex -= engUsed;
            }
            if (chiUsed > 0)
            {
                chiReplaceIndex -= chiUsed;
            }
        }
Example #8
0
        /// <summary>
        /// Post-processes the morphological analysis result to deal with some exceptions in
        /// endings (morphemes whose tag starts with "e"), based on the preceding morpheme.
        /// </summary>
        /// <remarks>
        /// The morpheme arrays obtained from each eojeol are modified in place. Two adjustments
        /// are made:
        /// an ending that starts with '어' is rewritten to '아' when the preceding morpheme calls
        /// for it, and the leading syllable of an ending starting with '으', '스' or '느' is dropped
        /// after a preceding morpheme that ends in a vowel or in the character 0x11AF.
        /// All prefix checks are ordinal, because the compared strings are raw jamo code
        /// sequences rather than linguistic text.
        /// </remarks>
        /// <param name="sos">the result of morphological analysis</param>
        /// <returns>the same <paramref name="sos"/> instance after post processing</returns>
        public virtual SetOfSentences doPostProcessing(SetOfSentences sos)
        {
            List <Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();

            // foreach disposes the enumerator, unlike the manual MoveNext loop.
            foreach (Eojeol[] eojeolSet in eojeolSetArray)
            {
                System.String prevMorph = "";

                for (int i = 0; i < eojeolSet.Length; i++)
                {
                    Eojeol          eojeol    = eojeolSet[i];
                    System.String[] morphemes = eojeol.Morphemes;
                    System.String[] tags      = eojeol.Tags;

                    for (int j = 0; j < eojeol.length; j++)
                    {
                        System.String tri = Code.toTripleString(morphemes[j]);
                        if (tags[j].StartsWith("e", System.StringComparison.Ordinal))
                        {
                            int prevLen = prevMorph.Length;

                            if (tri.StartsWith(A_, System.StringComparison.Ordinal))
                            {
                                /* 어 -> 아 */
                                if (prevLen >= 4 && prevMorph[prevLen - 1] == EU[1] && !isXEU(prevMorph[prevLen - 2]) && ((Code.isJungseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 3])) || (Code.isJongseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 4]))))
                                {
                                    morphemes[j] = Code.toString(AR.ToCharArray());
                                }
                                else if (prevLen >= 3 && prevMorph[prevLen - 1] == DOB[2] && !prevMorph.Substring(prevLen - 3).Equals(DOB) && !prevMorph.Substring(prevLen - 3).Equals(GOB))
                                {
                                    /* for 'ㅂ' irregular: keep '어' unless the preceding morpheme ends with 돕 or 곱.
                                     * The two exclusions must be joined with AND; joined with OR the test is
                                     * always true and 돕/곱 could never reach the vowel check below. */
                                }
                                else if (prevLen >= 2 && prevMorph.Substring(prevLen - 2).Equals(HA))
                                {
                                    /* after '하' the ending is left unchanged */
                                }
                                else if (prevLen >= 2 && ((Code.isJungseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 1])) || (Code.isJongseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 2]))))
                                {
                                    // the last vowel is checked both with and without a final consonant
                                    morphemes[j] = Code.toString(AR.ToCharArray());
                                }
                            }
                            else if (tri.StartsWith(EU.Substring(0, 2), System.StringComparison.Ordinal) || tri.StartsWith(SU.Substring(0, 4), System.StringComparison.Ordinal) || tri.StartsWith(NU.Substring(0, 4), System.StringComparison.Ordinal))
                            {
                                /* elision of '으', '스', '느' */
                                if (prevLen >= 2 && (Code.isJungseong(prevMorph[prevLen - 1]) || prevMorph[prevLen - 1] == 0x11AF))
                                {
                                    morphemes[j] = Code.toString(tri.Substring(2).ToCharArray());
                                }
                            }
                        }

                        // The (possibly rewritten) morpheme becomes the context for the next one.
                        prevMorph = Code.toTripleString(morphemes[j]);
                    }
                }
            }

            return(sos);
        }
Example #9
0
        /// <summary>
        /// Expands the morpheme chart using the irregular rules for 'ㄷ', 'ㅅ', 'ㅂ', 'ㅎ', '르' and '러'.
        /// </summary>
        /// <remarks>
        /// Each rule is tried independently and in a fixed order. A rule fires when <c>pcheck</c>
        /// accepts the characters at <c>cur - 1</c>, <c>cur</c> and <c>cur + 1</c> for that rule's
        /// pattern names. The string is then rewritten, split in two, and both halves are handed
        /// to <c>mc.phonemeChange</c> together with the rule's irregular type.
        /// </remarks>
        /// <param name="from">the start index for the segment position</param>
        /// <param name="prev">the passed part of the string (not used by this method)</param>
        /// <param name="str">the next part of the string to check</param>
        /// <param name="cur">the current index of the string for checking the rules</param>
        private void  rule_irr_word(int from, System.String prev, System.String str, int cur)
        {
            int length = str.Length;

            // Every rule requires cur > 0, so nothing can fire otherwise.
            if (cur <= 0)
            {
                return;
            }

            /* 'ᆮ' irregular rule (patterns l21 / 21 / r21) */
            if (cur <= length && pcheck(str, cur - 1, "l21") != 0 && pcheck(str, cur, "21") != 0 && pcheck(str, cur + 1, "r21") != 0)
            {
                System.String changed = replace(str, cur - 1, "ᆮ");
                System.String head    = changed.Substring(0, cur);
                System.String tail    = changed.Substring(cur);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_D);
            }

            /* 'ᆺ' irregular rule (patterns l22 / 22 / r22) */
            if (cur < length && pcheck(str, cur - 1, "l22") != 0 && pcheck(str, cur, "22") != 0 && pcheck(str, cur + 1, "r22") != 0)
            {
                System.String changed = insert(str, cur, "ᆺ");
                System.String head    = changed.Substring(0, cur + 1);
                System.String tail    = changed.Substring(cur + 1);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_S);
            }

            /* 'ㅂ' irregular rule (patterns l23 / 23 / r23) */
            if (cur <= length && pcheck(str, cur - 1, "l23") != 0 && pcheck(str, cur, "23") != 0 && pcheck(str, cur + 1, "r23") != 0)
            {
                System.String changed = replace(str, cur, "ᅳ");
                changed = insert(changed, cur - 1, "ᆸ");
                System.String head = changed.Substring(0, cur);
                System.String tail = changed.Substring(cur);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_B);
            }

            /* 'ᆸ' irregular rule (patterns l24 / 24 / r24) */
            if (cur <= length && pcheck(str, cur - 1, "l24") != 0 && pcheck(str, cur, "24") != 0 && pcheck(str, cur + 1, "r24") != 0)
            {
                System.String changed = replace(str, cur, "ᅥ");
                changed = insert(changed, cur - 1, "ᆸ");
                System.String head = changed.Substring(0, cur);
                System.String tail = changed.Substring(cur);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_B);
            }

            /* 'ㅂ' irregular rule (patterns l25 / 25 / r25); same rewrite as the previous rule */
            if (cur <= length && pcheck(str, cur - 1, "l25") != 0 && pcheck(str, cur, "25") != 0 && pcheck(str, cur + 1, "r25") != 0)
            {
                System.String changed = replace(str, cur, "ᅥ");
                changed = insert(changed, cur - 1, "ᆸ");
                System.String head = changed.Substring(0, cur);
                System.String tail = changed.Substring(cur);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_B);
            }

            /* 'ᇂ' irregular rule (patterns l26 / 26 / r26) */
            if (cur + 1 < length && pcheck(str, cur - 1, "l26") != 0 && pcheck(str, cur, "26") != 0 && pcheck(str, cur + 1, "r26") != 0)
            {
                System.String changed = insert(str, cur + 1, "ᇂ으");
                System.String head    = changed.Substring(0, cur + 2);
                System.String tail    = changed.Substring(cur + 2);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_H);
            }

            /* 'ㅎ' irregular rule (patterns l27 / 27 / r27); registers two candidates */
            if (cur + 1 < length && pcheck(str, cur - 1, "l27") != 0 && pcheck(str, cur, "27") != 0 && pcheck(str, cur + 1, "r27") != 0)
            {
                bool isAe = str[cur] == 'ᅢ';

                // First candidate: the vowel at cur becomes 'ᅡ' (or 'ᅣ').
                System.String changed = replace(str, cur, isAe ? "ᅡ" : "ᅣ");
                changed = insert(changed, cur + 1, "ᇂ어");
                System.String head = changed.Substring(0, cur + 2);
                System.String tail = changed.Substring(cur + 2);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_H);

                // Second candidate (added by Lee Un-jae): the vowel becomes 'ᅥ' (or 'ᅧ').
                changed = replace(str, cur, isAe ? "ᅥ" : "ᅧ");
                changed = insert(changed, cur + 1, "ᇂ어");
                head    = changed.Substring(0, cur + 2);
                tail    = changed.Substring(cur + 2);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_H);
            }

            /* 'ㅎ' irregular rule (patterns l28 / 28 / r28) */
            if (cur + 1 < length && pcheck(str, cur - 1, "l28") != 0 && pcheck(str, cur, "28") != 0 && pcheck(str, cur + 1, "r28") != 0)
            {
                System.String changed = replace(str, cur, "ᅥ");
                changed = insert(changed, cur + 1, "ᇂᄋ");
                System.String head = changed.Substring(0, cur + 2);
                System.String tail = changed.Substring(cur + 2);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_H);
            }

            /* '르' irregular rule (patterns l29 / 29 / r29) */
            if (cur < length && pcheck(str, cur - 1, "l29") != 0 && pcheck(str, cur, "29") != 0 && pcheck(str, cur + 1, "r29") != 0)
            {
                System.String changed = replace(str, cur, "ᅳ");
                if (changed[cur + 1] == 'ᅡ')
                {
                    // The following 'ᅡ' is turned into 'ᅥ'.
                    changed = changed.Substring(0, cur + 1) + 'ᅥ' + changed.Substring(cur + 2);
                }
                changed = insert(changed, cur + 1, "ᄋ");
                // The character before cur is passed through Code.toChoseong.
                changed = changed.Substring(0, cur - 1) + Code.toChoseong(changed[cur - 1]) + changed.Substring(cur);

                System.String head = changed.Substring(0, cur + 1);
                System.String tail = changed.Substring(cur + 1);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_REU);
            }

            /* '러' irregular rule (patterns l30 / 30 / r30); also needs 'ᄅ' two positions back */
            if (cur <= length && pcheck(str, cur - 1, "l30") != 0 && pcheck(str, cur, "30") != 0 && pcheck(str, cur + 1, "r30") != 0 && (cur - 2 >= 0 && str[cur - 2] == 'ᄅ'))
            {
                System.String changed = replace(str, cur, "ᄋ");
                System.String head    = changed.Substring(0, cur);
                System.String tail    = changed.Substring(cur);
                mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_REO);
            }
        }