/// <summary>
/// Reads the morpheme dictionary file line by line and stores every
/// (word, tag, phoneme) entry into the trie through <c>store</c>.
/// </summary>
/// <remarks>
/// Each non-empty line is split on tabs and spaces: the first token is the word,
/// every following token is <c>TAG</c> or <c>TAG.IRREGULAR</c>. Tokens whose tag is
/// unknown to <paramref name="tagSet"/> are reported on standard error and skipped.
/// </remarks>
/// <param name="dictionaryFileName">The file path of the morpheme dictionary (read as UTF-8).</param>
/// <param name="tagSet">The morpheme tag set used to resolve tag and irregular-type names.</param>
/// <exception cref="System.IO.IOException">Propagated when the file cannot be opened or read.</exception>
public virtual void read_dic(System.String dictionaryFileName, TagSet tagSet)
{
    // The reader (and the stream under it) is released even when parsing throws.
    using (System.IO.FileStream stream = new System.IO.FileStream(
               dictionaryFileName, System.IO.FileMode.Open, System.IO.FileAccess.Read))
    using (System.IO.StreamReader in_Renamed = new System.IO.StreamReader(stream, System.Text.Encoding.UTF8))
    {
        // Scratch buffer reused for every dictionary line.
        INFO[] info_list = new INFO[255];
        for (int i = 0; i < 255; i++)
        {
            info_list[i] = new INFO(this);
        }

        System.String str;
        while ((str = in_Renamed.ReadLine()) != null)
        {
            // Strings are immutable: the trimmed value has to be assigned back,
            // otherwise whitespace-only lines are not recognised as empty.
            str = str.Trim();
            if (str.Length == 0)
            {
                continue;
            }

            StringTokenizer tok = new StringTokenizer(str, "\t ");
            System.String word = tok.NextToken;
            int isize = 0;

            while (tok.HasMoreTokens)
            {
                System.String data = tok.NextToken;
                StringTokenizer tok2 = new StringTokenizer(data, ".");
                System.String curt = tok2.NextToken;

                int x = tagSet.getTagID(curt);
                if (x == -1)
                {
                    System.Console.Error.WriteLine("read_dic:tag error");
                    continue;
                }

                if (tok2.HasMoreTokens)
                {
                    info_list[isize].phoneme = (short)tagSet.getIrregularID(tok2.NextToken);
                }
                else
                {
                    info_list[isize].phoneme = TagSet.PHONEME_TYPE_ALL;
                }
                info_list[isize].tag = x;
                isize++;
            }

            // Zero the slot right after the last entry that was filled in.
            info_list[isize].tag = 0;
            info_list[isize].phoneme = 0;

            char[] word3 = Code.toTripleArray(word);
            for (int i = 0; i < isize; i++)
            {
                store(word3, info_list[i]);
            }
        }
    }
}
/// <summary>
/// Dumps the segment position table: the entry count goes to standard error,
/// one line per entry (key and next position) goes to the trace listeners.
/// </summary>
public virtual void printPosition()
{
    System.Console.Error.WriteLine("positionEnd: " + positionEnd);

    int index = 0;
    while (index < positionEnd)
    {
        System.String line = string.Format(
            "position[{0}].key={1} nextPosition={2}",
            index,
            Code.toCompatibilityJamo(position[index].key),
            position[index].nextPosition);
        Trace.WriteLine(line);
        index++;
    }
}
/// <summary>
/// Constructor. Pre-computes, through <c>Code.toTripleString</c>, the encoded forms
/// of the Hangul literals that the post-processing rules compare against.
/// </summary>
public PostProcessor()
{
    // Single-syllable patterns.
    HA = Code.toTripleString("하");
    AR = Code.toTripleString("아");
    A_ = Code.toTripleString("어");

    // Character sets consulted by the rules.
    PV = Code.toTripleString("ㅏㅑㅗ");
    XEU = Code.toTripleString("끄뜨쓰크트");

    // Stems singled out by the 'ㅂ' irregular handling.
    DOB = Code.toTripleString("돕");
    GOB = Code.toTripleString("곱");

    // Ending prefixes.
    EU = Code.toTripleString("으");
    SU = Code.toTripleString("습니");
    NU = Code.toTripleString("는다");
}
/// <summary>
/// Resets the analyzer state for a new eojeol and seeds the morpheme chart with
/// its first (index 0) entry.
/// </summary>
/// <param name="word">The plain string of the eojeol to analyze.</param>
public virtual void init(System.String word)
{
    simti.init();

    System.String replaced = preReplace(word);
    sp.init(Code.toTripleString(replaced), simti);

    chartEnd = 0;
    int slot = chartEnd;

    // Register the first chart entry on segment position 0.
    Position start = sp.getPosition(0);
    start.morpheme[start.morphCount++] = slot;

    chart[slot].tag = tagSet.iwgTag;
    chart[slot].phoneme = 0;
    chart[slot].nextPosition = 1;
    chart[slot].nextTagType = 0;
    chart[slot].state = MORPHEME_STATE_SUCCESS;
    chart[slot].connectionCount = 0;
    chart[slot].str = "";

    chartEnd++;
}
/// <summary>
/// Recursively writes the trie to <paramref name="pw"/>: one line per node,
/// indented with one tab per depth level, followed by the node's children.
/// </summary>
/// <param name="pw">The writer that receives the dump.</param>
/// <param name="idx">The index of the trie node to print.</param>
/// <param name="depth">The depth of the current node (number of leading tabs).</param>
/// <param name="tagSet">The morpheme tag set used to turn tag IDs into names.</param>
public virtual void print_trie(System.IO.StreamWriter pw, int idx, int depth, TagSet tagSet)
{
    int level = 0;
    while (level < depth)
    {
        pw.Write("\t");
        level++;
    }

    pw.Write(idx + ":" + Code.toCompatibilityJamo(trie_buf[idx].key) + " ");

    if (trie_buf[idx].info_list != null)
    {
        int entry = 0;
        while (entry < trie_buf[idx].info_list.Count)
        {
            pw.Write("t:" + tagSet.getTagName(trie_buf[idx].info_list.Get_Renamed(entry).tag) + " ");
            entry++;
        }
    }
    pw.WriteLine();

    // Children are addressed as child_idx, child_idx + 1, ... in trie_buf.
    int child = 0;
    while (child < trie_buf[idx].child_size)
    {
        print_trie(pw, trie_buf[idx].child_idx + child, depth + 1, tagSet);
        child++;
    }
}
/// <summary>
/// Dumps every entry of the morpheme chart. Headers are written to standard error,
/// the entry details and connection lists are written to the trace listeners.
/// </summary>
public virtual void printMorphemeAll()
{
    System.Console.Error.WriteLine("chartEnd: " + chartEnd);

    int id = 0;
    while (id < chartEnd)
    {
        System.Console.Error.WriteLine("chartID: " + id);

        System.String details = string.Format(
            "{0}/{1}.{2} nextPosition={3} nextTagType={4} state={5} ",
            Code.toString(chart[id].str.ToCharArray()),
            tagSet.getTagName(chart[id].tag),
            tagSet.getIrregularName(chart[id].phoneme),
            Code.toCompatibilityJamo(sp.getPosition(chart[id].nextPosition).key),
            tagSet.getTagName(chart[id].nextTagType),
            chart[id].state);
        Trace.Write(details);

        System.Console.Error.Write("connection=");
        int link = 0;
        while (link < chart[id].connectionCount)
        {
            Trace.Write(chart[id].connection[link] + ", ");
            link++;
        }

        Trace.Write(Environment.NewLine);
        id++;
    }
}
/// <summary>
/// Walks the morpheme chart depth-first from <paramref name="chartIndex"/> and
/// appends every completed morpheme/tag sequence to <c>resEojeols</c>.
/// </summary>
/// <remarks>
/// Index 0 is treated as the root: each of its connections starts a fresh result.
/// For any other entry, a connection value of 0 ends the current path.
/// </remarks>
/// <param name="chartIndex">The start index of the chart to generate the final result from.</param>
private void printChart(int chartIndex)
{
    Morpheme morph = chart[chartIndex];

    if (chartIndex == 0)
    {
        // Root entry: every outgoing connection begins a new candidate.
        for (int c = 0; c < morph.connectionCount; c++)
        {
            resMorphemes.Clear();
            resTags.Clear();
            printChart(morph.connection[c]);
        }
        return;
    }

    System.String surface = Code.toString(morph.str.ToCharArray());
    int engCnt = 0;
    int chiCnt = 0;

    // Swap placeholders back for the saved replacement strings, one occurrence at a
    // time; ENG_REPLACE placeholders are always tried before CHI_REPLACE ones.
    while (true)
    {
        if (surface.IndexOf(ENG_REPLACE) != -1)
        {
            engCnt++;
            surface = surface.ReplaceFirst(ENG_REPLACE, engReplacementList.Get_Renamed(engReplaceIndex++));
        }
        else if (surface.IndexOf(CHI_REPLACE) != -1)
        {
            chiCnt++;
            surface = surface.ReplaceFirst(CHI_REPLACE, chiReplacementList.Get_Renamed(chiReplaceIndex++));
        }
        else
        {
            break;
        }
    }

    resMorphemes.Add(surface);
    resTags.Add(tagSet.getTagName(morph.tag));

    for (int c = 0; c < morph.connectionCount && printResultCnt < MAX_CANDIDATE_NUM; c++)
    {
        if (morph.connection[c] != 0)
        {
            printChart(morph.connection[c]);
            continue;
        }

        // End of a path: snapshot the accumulated morphemes and tags.
        System.String[] mArray = resMorphemes.ToArray();
        System.String[] tArray = resTags.ToArray();
        resEojeols.AddLast(new Eojeol(mArray, tArray));
        printResultCnt++;
    }

    // Backtrack: drop this morpheme and give back the replacement slots it consumed.
    resMorphemes.RemoveAt(resMorphemes.Count - 1);
    resTags.RemoveAt(resTags.Count - 1);

    if (engCnt > 0)
    {
        engReplaceIndex -= engCnt;
    }
    if (chiCnt > 0)
    {
        chiReplaceIndex -= chiCnt;
    }
}
/// <summary>
/// Post-processes the morphological analysis result in place: for morphemes whose
/// tag starts with "e", an ending beginning with '어' may be rewritten to '아', and
/// the leading two code units of endings starting with the '으', '습니' or '는다'
/// prefixes may be dropped, depending on the tail of the preceding morpheme.
/// </summary>
/// <remarks>
/// All prefix tests use ordinal comparison, so the jamo code units are compared
/// exactly and independently of the current culture.
/// </remarks>
/// <param name="sos">The result of morphological analysis; its morpheme arrays are modified.</param>
/// <returns>The same <paramref name="sos"/> instance after post-processing.</returns>
public virtual SetOfSentences doPostProcessing(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();

    foreach (Eojeol[] eojeolSet in eojeolSetArray)
    {
        // Encoded form of the previously visited morpheme; carried across the
        // eojeols of one set and reset for each set.
        System.String prevMorph = "";

        for (int i = 0; i < eojeolSet.Length; i++)
        {
            Eojeol eojeol = eojeolSet[i];
            System.String[] morphemes = eojeol.Morphemes;
            System.String[] tags = eojeol.Tags;

            for (int j = 0; j < eojeol.length; j++)
            {
                System.String tri = Code.toTripleString(morphemes[j]);

                if (tags[j].StartsWith("e", System.StringComparison.Ordinal))
                {
                    int prevLen = prevMorph.Length;

                    if (tri.StartsWith(A_, System.StringComparison.Ordinal))
                    {
                        /* 어 -> 아 */
                        if (prevLen >= 4
                            && prevMorph[prevLen - 1] == EU[1]
                            && !isXEU(prevMorph[prevLen - 2])
                            && ((Code.isJungseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 3]))
                                || (Code.isJongseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 4]))))
                        {
                            morphemes[j] = Code.toString(AR.ToCharArray());
                        }
                        else if (prevLen >= 3
                                 && prevMorph[prevLen - 1] == DOB[2]
                                 && !prevMorph.Substring(prevLen - 3).Equals(DOB)
                                 && !prevMorph.Substring(prevLen - 3).Equals(GOB))
                        {
                            /* for 'ㅂ' irregular: keep '어'. The 돕/곱 stems are excluded
                               here (both inequalities must hold) so that they fall through
                               to the vowel check below. */
                        }
                        else if (prevLen >= 2 && prevMorph.Substring(prevLen - 2).Equals(HA))
                        {
                            /* preceding morpheme ends with '하': keep '어' */
                        }
                        else if (prevLen >= 2
                                 && ((Code.isJungseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 1]))
                                     || (Code.isJongseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 2]))))
                        {
                            // final consonant or not
                            morphemes[j] = Code.toString(AR.ToCharArray());
                        }
                    }
                    else if (tri.StartsWith(EU.Substring(0, 2), System.StringComparison.Ordinal)
                             || tri.StartsWith(SU.Substring(0, 4), System.StringComparison.Ordinal)
                             || tri.StartsWith(NU.Substring(0, 4), System.StringComparison.Ordinal))
                    {
                        /* elision of '으', '스', '느' */
                        if (prevLen >= 2
                            && (Code.isJungseong(prevMorph[prevLen - 1]) || prevMorph[prevLen - 1] == 0x11AF))
                        {
                            morphemes[j] = Code.toString(tri.Substring(2).ToCharArray());
                        }
                    }
                }

                prevMorph = Code.toTripleString(morphemes[j]);
            }
        }
    }

    return sos;
}
/// <summary>
/// Expands the morpheme chart according to the irregular rules about
/// 'ㄷ', 'ㅅ', 'ㅂ', 'ㅎ', '르' and '러'.
/// </summary>
/// <remarks>
/// Every rule first checks the characters around <paramref name="cur"/> with
/// <c>pcheck</c>, then builds a modified copy of <paramref name="str"/>, splits it in
/// two and hands both halves to <c>mc.phonemeChange</c>. The rules are independent:
/// several of them may fire for the same input.
/// </remarks>
/// <param name="from">The start index for the segment position.</param>
/// <param name="prev">The passed part of the string (not read by this method).</param>
/// <param name="str">The next part of the string to check.</param>
/// <param name="cur">The current index of the string for checking the rules.</param>
private void rule_irr_word(int from, System.String prev, System.String str, int cur)
{
    int len = str.Length;

    // Index guards shared by the rules; plain integer tests, evaluated once.
    bool curUpToEnd = cur > 0 && cur <= len;
    bool curInside = cur > 0 && cur < len;
    bool curBeforeLast = cur > 0 && cur + 1 < len;

    System.String altered;
    System.String head;
    System.String tail;

    /* 'ᆮ' irregular rule */
    if (curUpToEnd
        && pcheck(str, cur - 1, "l21") != 0
        && pcheck(str, cur, "21") != 0
        && pcheck(str, cur + 1, "r21") != 0)
    {
        altered = replace(str, cur - 1, "ᆮ");
        head = altered.Substring(0, cur);
        tail = altered.Substring(cur);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_D);
    }

    /* 'ᆺ' irregular rule */
    if (curInside
        && pcheck(str, cur - 1, "l22") != 0
        && pcheck(str, cur, "22") != 0
        && pcheck(str, cur + 1, "r22") != 0)
    {
        altered = insert(str, cur, "ᆺ");
        head = altered.Substring(0, cur + 1);
        tail = altered.Substring(cur + 1);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_S);
    }

    /* 'ㅂ' irregular rule */
    if (curUpToEnd
        && pcheck(str, cur - 1, "l23") != 0
        && pcheck(str, cur, "23") != 0
        && pcheck(str, cur + 1, "r23") != 0)
    {
        altered = replace(str, cur, "ᅳ");
        altered = insert(altered, cur - 1, "ᆸ");
        head = altered.Substring(0, cur);
        tail = altered.Substring(cur);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_B);
    }

    /* 'ᆸ' irregular rule */
    if (curUpToEnd
        && pcheck(str, cur - 1, "l24") != 0
        && pcheck(str, cur, "24") != 0
        && pcheck(str, cur + 1, "r24") != 0)
    {
        altered = replace(str, cur, "ᅥ");
        altered = insert(altered, cur - 1, "ᆸ");
        head = altered.Substring(0, cur);
        tail = altered.Substring(cur);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_B);
    }

    /* 'ㅂ' irregular rule */
    if (curUpToEnd
        && pcheck(str, cur - 1, "l25") != 0
        && pcheck(str, cur, "25") != 0
        && pcheck(str, cur + 1, "r25") != 0)
    {
        altered = replace(str, cur, "ᅥ");
        altered = insert(altered, cur - 1, "ᆸ");
        head = altered.Substring(0, cur);
        tail = altered.Substring(cur);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_B);
    }

    /* 'ᇂ' irregular rule */
    if (curBeforeLast
        && pcheck(str, cur - 1, "l26") != 0
        && pcheck(str, cur, "26") != 0
        && pcheck(str, cur + 1, "r26") != 0)
    {
        altered = insert(str, cur + 1, "ᇂ으");
        head = altered.Substring(0, cur + 2);
        tail = altered.Substring(cur + 2);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_H);
    }

    /* 'ㅎ' irregular rule */
    if (curBeforeLast
        && pcheck(str, cur - 1, "l27") != 0
        && pcheck(str, cur, "27") != 0
        && pcheck(str, cur + 1, "r27") != 0)
    {
        bool isAe = str[cur] == 'ᅢ';

        // First candidate: restore the vowel as 'ᅡ' (from 'ᅢ') or 'ᅣ' (otherwise).
        altered = isAe ? replace(str, cur, "ᅡ") : replace(str, cur, "ᅣ");
        altered = insert(altered, cur + 1, "ᇂ어");
        head = altered.Substring(0, cur + 2);
        tail = altered.Substring(cur + 2);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_H);

        // Second candidate (added by Lee Un-jae): 'ᅥ' (from 'ᅢ') or 'ᅧ' (otherwise).
        altered = isAe ? replace(str, cur, "ᅥ") : replace(str, cur, "ᅧ");
        altered = insert(altered, cur + 1, "ᇂ어");
        head = altered.Substring(0, cur + 2);
        tail = altered.Substring(cur + 2);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_H);
    }

    /* 'ㅎ' irregular rule */
    if (curBeforeLast
        && pcheck(str, cur - 1, "l28") != 0
        && pcheck(str, cur, "28") != 0
        && pcheck(str, cur + 1, "r28") != 0)
    {
        altered = replace(str, cur, "ᅥ");
        altered = insert(altered, cur + 1, "ᇂᄋ");
        head = altered.Substring(0, cur + 2);
        tail = altered.Substring(cur + 2);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_H);
    }

    /* '르' irregular rule */
    if (curInside
        && pcheck(str, cur - 1, "l29") != 0
        && pcheck(str, cur, "29") != 0
        && pcheck(str, cur + 1, "r29") != 0)
    {
        altered = replace(str, cur, "ᅳ");
        if (altered[cur + 1] == 'ᅡ')
        {
            altered = altered.Substring(0, cur + 1) + 'ᅥ' + altered.Substring(cur + 2);
        }
        altered = insert(altered, cur + 1, "ᄋ");
        // Re-encode the character just before cur through Code.toChoseong.
        altered = altered.Substring(0, cur - 1) + Code.toChoseong(altered[cur - 1]) + altered.Substring(cur);
        head = altered.Substring(0, cur + 1);
        tail = altered.Substring(cur + 1);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_REU);
    }

    /* '러' irregular rule */
    if (curUpToEnd
        && pcheck(str, cur - 1, "l30") != 0
        && pcheck(str, cur, "30") != 0
        && pcheck(str, cur + 1, "r30") != 0
        && (cur - 2 >= 0 && str[cur - 2] == 'ᄅ'))
    {
        altered = replace(str, cur, "ᄋ");
        head = altered.Substring(0, cur);
        tail = altered.Substring(cur);
        mc.phonemeChange(from, head, tail, TagSet.TAG_TYPE_YONGS, TagSet.TAG_TYPE_EOMIES, tagSet.IRR_TYPE_REO);
    }
}