/// <summary> Runs viterbi to get the final morphological analysis result which has the highest probability.</summary>
/// <param name="sos">- all the candidates of morphological analysis</param>
/// <returns> the final morphological analysis result which has the highest probability</returns>
private Sentence end_sentence(SetOfSentences sos)
{
    int i, j, k;

    /* Creates the last node (sentence-final marker tagged "SF") so the lattice has a terminal state */
    i = new_wp(" ");
    wp[i].MNode = new_mnode(null, "SF", 0);

    /* Runs viterbi: for every pair of adjacent word positions, relax all transitions
       between their candidate nodes (candidates are chained via mn[].Sibling; index 0
       acts as the null link). */
    for (i = 1; i < wp_end - 1; i++)
    {
        for (j = wp[i].MNode; j != 0; j = mn[j].Sibling)
        {
            for (k = wp[i + 1].MNode; k != 0; k = mn[k].Sibling)
            {
                update_prob_score(j, k);
            }
        }
    }

    /* Backtrace: follow mn[].Backptr from the last word position, filling the
       result array from the tail forwards. */
    i = sos.length;
    Eojeol[] eojeols = new Eojeol[i];
    for (k = wp[i].MNode; k != 0; k = mn[k].Backptr)
    {
        eojeols[--i] = mn[k].Eojeol;
    }
    return(new Sentence(sos.DocumentID, sos.SentenceID, sos.EndOfDocument, sos.getPlainEojeolArray().ToArray(), eojeols));
}
/// <summary> Adds a new node for the markov model.</summary>
/// <param name="eojeol">- the eojeol to add</param>
/// <param name="wp_tag">- the eojeol tag</param>
/// <param name="prob">- the probability P(w|t)</param>
/// <returns> the index of the new node</returns>
private int new_mnode(Eojeol eojeol, System.String wp_tag, double prob)
{
    // Claim the next free slot in the node pool, then populate it.
    int node = mn_end++;
    mn[node].Sibling = 0;
    mn[node].Backptr = 0;
    mn[node].Prob_Wt = prob;
    mn[node].Wp_Tag = wp_tag;
    mn[node].Eojeol = eojeol;
    return node;
}
/// <summary> It changes the POS tagging result with 69 KAIST tags to the simplified result with 9 tags.</summary>
/// <param name="st">- the result of morphological analysis where each eojeol has more than analysis result</param>
/// <returns> the simplified POS tagging result</returns>
public virtual Sentence doProcess(Sentence st)
{
    System.String prevTag = null;
    bool changed = false;
    Eojeol[] eojeolSet = st.Eojeols;
    for (int i = 0; i < eojeolSet.Length; i++)
    {
        System.String[] tags = eojeolSet[i].Tags;
        prevTag = "";
        changed = false;
        // Map every tag down to the simplified tag set, remembering whether any
        // two adjacent morphemes collapsed onto the same simplified tag.
        for (int j = 0; j < tags.Length; j++)
        {
            tags[j] = TagMapper.getKaistTagOnLevel(tags[j], TAG_LEVEL);
            if (tags[j].Equals(prevTag))
            {
                changed = true;
            }
            prevTag = tags[j];
        }
        if (changed)
        {
            // Merge each run of identical simplified tags into a single morpheme
            // (concatenated left-to-right), emitting one tag per run.
            tagList.Clear();
            morphemeList.Clear();
            System.String[] morphemes = eojeolSet[i].Morphemes;
            for (int j = 0; j < tags.Length - 1; j++)
            {
                if (tags[j].Equals(tags[j + 1]))
                {
                    // Fold this morpheme into its successor; the run is emitted
                    // when its last element is reached.
                    morphemes[j + 1] = morphemes[j] + morphemes[j + 1];
                }
                else
                {
                    tagList.Add(tags[j]);
                    morphemeList.Add(morphemes[j]);
                }
            }
            tagList.Add(tags[tags.Length - 1]);
            morphemeList.Add(morphemes[morphemes.Length - 1]);
            eojeolSet[i] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
        }
    }
    st.Eojeols = eojeolSet;
    return(st);
}
/// <summary> Expands analyses that contain "unk" (unknown) tags: for every "unk"
/// occurrence a copy of the eojeol retagged "nqq" is appended as an extra candidate,
/// and the original occurrence is settled on "ncn".</summary>
/// <param name="sos">- the morphological analysis results to expand</param>
/// <returns> the same SetOfSentences, updated in place</returns>
public virtual SetOfSentences doProcess(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();
    LinkedList<Eojeol> eojeolArray = new LinkedList<Eojeol>();
    for (int i = 0; i < eojeolSetArray.Count; i++)
    {
        Eojeol[] eojeolSet = eojeolSetArray[i];
        eojeolArray.Clear();
        for (int j = 0; j < eojeolSet.Length; j++)
        {
            eojeolArray.AddLast(eojeolSet[j]);
        }
        int unkCount = 0;
        // Indexed scan over a list that grows while being scanned: candidates
        // appended below are themselves re-examined for further "unk" tags.
        for (int j = 0; j < eojeolArray.Count; j++)
        {
            Eojeol eojeol = eojeolArray.Get_Renamed(j);
            System.String[] tags = eojeol.Tags;
            System.String[] morphemes = eojeol.Morphemes;
            for (int k = 0; k < tags.Length; k++)
            {
                if (tags[k].Equals("unk"))
                {
                    // Temporarily retag as "nqq", snapshot via cloned arrays,
                    // then leave the original occurrence as "ncn".
                    tags[k] = "nqq";
                    Eojeol newEojeol = new Eojeol(morphemes.Clone() as string[], tags.Clone() as string[]);
                    eojeolArray.AddLast(newEojeol);
                    tags[k] = "ncn";
                    unkCount++;
                }
            }
        }
        if (unkCount > 0)
        {
            // Only rebuild the candidate array when something was expanded.
            eojeolSetArray[i] = eojeolArray.ToArray(eojeolSet);
        }
    }
    return(sos);
}
/// <summary> Expands analyses containing "unk" (unknown) tags: each occurrence
/// yields an extra candidate retagged "nqq", while the original is retagged "ncn".</summary>
/// <param name="sos">- the morphological analysis results to expand</param>
/// <returns> the same SetOfSentences, updated in place</returns>
public virtual SetOfSentences doProcess(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();
    LinkedList<Eojeol> eojeolArray = new LinkedList<Eojeol>();

    for (int idx = 0; idx < eojeolSetArray.Count; idx++)
    {
        Eojeol[] originalSet = eojeolSetArray[idx];

        eojeolArray.Clear();
        foreach (Eojeol candidate in originalSet)
        {
            eojeolArray.AddLast(candidate);
        }

        int replaced = 0;
        // Indexed loop on purpose: the list grows while we scan it, and the
        // freshly appended variants must be scanned for further "unk" tags too.
        for (int pos = 0; pos < eojeolArray.Count; pos++)
        {
            Eojeol current = eojeolArray.Get_Renamed(pos);
            System.String[] tagArr = current.Tags;
            System.String[] morphArr = current.Morphemes;
            for (int t = 0; t < tagArr.Length; t++)
            {
                if (tagArr[t].Equals("unk"))
                {
                    // Snapshot a variant with "nqq" at this slot, then settle
                    // the original occurrence on "ncn".
                    tagArr[t] = "nqq";
                    eojeolArray.AddLast(new Eojeol(morphArr.Clone() as string[], tagArr.Clone() as string[]));
                    tagArr[t] = "ncn";
                    replaced++;
                }
            }
        }

        if (replaced > 0)
        {
            eojeolSetArray[idx] = eojeolArray.ToArray(originalSet);
        }
    }
    return sos;
}
/// <summary> It processes the input plain eojeol by analyzing it or searching the pre-analyzed dictionary.</summary>
/// <param name="plainEojeol">- plain eojeol to analyze</param>
/// <returns> the morphologically analyzed eojeol list</returns>
private Eojeol[] processEojeol(System.String plainEojeol)
{
    System.String analysis = analyzedDic.get_Renamed(plainEojeol);

    eojeolList.Clear();
    if (analysis != null)
    {
        // The eojeol is registered in the pre-analyzed dictionary: candidates are
        // separated by '^', each of the form "morph1/tag1+morph2/tag2+...".
        StringTokenizer st = new StringTokenizer(analysis, "^");
        while (st.HasMoreTokens)
        {
            System.String analyzed = st.NextToken;
            // FIX: "\\+|/" is a regex alternation ('+' or '/'), carried over from
            // Java's String.split. C#'s String.Split treats it as the literal
            // separator "\+|/", which never occurs, so the candidate was left
            // untokenized. Regex.Split restores the intended behavior.
            System.String[] tokens = System.Text.RegularExpressions.Regex.Split(analyzed, "\\+|/");
            System.String[] morphemes = new System.String[tokens.Length / 2];
            System.String[] tags = new System.String[tokens.Length / 2];
            // tokens alternate morpheme, tag, morpheme, tag, ...
            for (int i = 0, j = 0; i < morphemes.Length; i++)
            {
                morphemes[i] = tokens[j++];
                tags[i] = tokens[j++];
            }
            Eojeol eojeol = new Eojeol(morphemes, tags);
            eojeolList.AddLast(eojeol);
        }
    }
    else
    {
        // Not in the dictionary: run the full morphological analysis chart.
        chart.init(plainEojeol);
        chart.analyze();
        chart.getResult();  // NOTE(review): return value discarded — the chart presumably fills eojeolList; confirm
    }
    return(eojeolList.ToArray());
}
/// <summary> It changes the POS tagging result with 69 KAIST tags to the simplified result with 9 tags.</summary>
/// <param name="st">- the result of morphological analysis where each eojeol has more than analysis result</param>
/// <returns> the simplified POS tagging result</returns>
public virtual Sentence doProcess(Sentence st)
{
    Eojeol[] eojeolSet = st.Eojeols;

    for (int i = 0; i < eojeolSet.Length; i++)
    {
        System.String[] tags = eojeolSet[i].Tags;

        // Map every tag down to the simplified tag set, noting whether two
        // neighbouring morphemes collapsed onto the same simplified tag.
        bool needsMerge = false;
        System.String previous = "";
        for (int j = 0; j < tags.Length; j++)
        {
            tags[j] = TagMapper.getKaistTagOnLevel(tags[j], TAG_LEVEL);
            needsMerge |= tags[j].Equals(previous);
            previous = tags[j];
        }

        if (!needsMerge)
        {
            continue;
        }

        // Merge each run of identical simplified tags into one morpheme,
        // concatenated left-to-right, with a single tag per run.
        tagList.Clear();
        morphemeList.Clear();
        System.String[] morphemes = eojeolSet[i].Morphemes;
        for (int j = 0; j < tags.Length - 1; j++)
        {
            if (tags[j].Equals(tags[j + 1]))
            {
                morphemes[j + 1] = morphemes[j] + morphemes[j + 1];
            }
            else
            {
                tagList.Add(tags[j]);
                morphemeList.Add(morphemes[j]);
            }
        }
        tagList.Add(tags[tags.Length - 1]);
        morphemeList.Add(morphemes[morphemes.Length - 1]);
        eojeolSet[i] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
    }

    st.Eojeols = eojeolSet;
    return st;
}
/// <summary> It changes the morphological analysis result with 69 KAIST tags to the simplified result with 22 tags.</summary>
/// <param name="sos">- the result of morphological analysis where each eojeol has more than analysis result</param>
/// <returns> the simplified morphological analysis result</returns>
public virtual SetOfSentences doProcess(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();
    List<Eojeol[]> resultSetArray = new List<Eojeol[]>();
    int len = eojeolSetArray.Count;
    System.String prevTag = null;
    bool changed = false;
    for (int pos = 0; pos < len; pos++)
    {
        Eojeol[] eojeolSet = eojeolSetArray[pos];
        // Simplifying tags can make previously distinct analyses identical;
        // dupFilterMap (keyed on the eojeol's string form) drops such duplicates.
        dupFilterMap.Clear();
        for (int i = 0; i < eojeolSet.Length; i++)
        {
            System.String[] tags = eojeolSet[i].Tags;
            prevTag = "";
            changed = false;
            // Map every tag down to the simplified tag set, noting whether two
            // adjacent morphemes collapsed onto the same simplified tag.
            for (int j = 0; j < tags.Length; j++)
            {
                tags[j] = TagMapper.getKaistTagOnLevel(tags[j], TAG_LEVEL);
                if (tags[j].Equals(prevTag))
                {
                    changed = true;
                }
                prevTag = tags[j];
            }
            if (changed)
            {
                // Merge each run of identical simplified tags into one morpheme
                // (concatenated left-to-right), with a single tag per run.
                tagList.Clear();
                morphemeList.Clear();
                System.String[] morphemes = eojeolSet[i].Morphemes;
                for (int j = 0; j < tags.Length - 1; j++)
                {
                    if (tags[j].Equals(tags[j + 1]))
                    {
                        morphemes[j + 1] = morphemes[j] + morphemes[j + 1];
                    }
                    else
                    {
                        tagList.Add(tags[j]);
                        morphemeList.Add(morphemes[j]);
                    }
                }
                tagList.Add(tags[tags.Length - 1]);
                morphemeList.Add(morphemes[morphemes.Length - 1]);
                eojeolSet[i] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
            }
            System.String key = eojeolSet[i].ToString();
            if (!dupFilterMap.ContainsKey(key))
            {
                dupFilterMap[key] = eojeolSet[i];
            }
        }
        // Only swap in the filtered set when a duplicate was actually dropped.
        if (eojeolSet.Length != dupFilterMap.Count)
        {
            resultSetArray.Add(dupFilterMap.Values.ToArray());
        }
        else
        {
            resultSetArray.Add(eojeolSet);
        }
    }
    sos.setEojeolSetArray(resultSetArray);
    return sos;
}
/// <summary> It changes the morphological analysis result with 69 KAIST tags to the simplified result with 22 tags.</summary>
/// <param name="sos">- the result of morphological analysis where each eojeol has more than analysis result</param>
/// <returns> the simplified morphological analysis result</returns>
public virtual SetOfSentences doProcess(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();
    List<Eojeol[]> resultSetArray = new List<Eojeol[]>();

    for (int pos = 0; pos < eojeolSetArray.Count; pos++)
    {
        Eojeol[] eojeolSet = eojeolSetArray[pos];

        // Simplification can make previously distinct analyses identical;
        // dupFilterMap (keyed on the eojeol's string form) drops the duplicates.
        dupFilterMap.Clear();
        for (int i = 0; i < eojeolSet.Length; i++)
        {
            System.String[] tags = eojeolSet[i].Tags;

            // Map every tag to the simplified set, noting whether adjacent
            // morphemes collapsed onto the same simplified tag.
            bool needsMerge = false;
            System.String previous = "";
            for (int j = 0; j < tags.Length; j++)
            {
                tags[j] = TagMapper.getKaistTagOnLevel(tags[j], TAG_LEVEL);
                needsMerge |= tags[j].Equals(previous);
                previous = tags[j];
            }

            if (needsMerge)
            {
                // Merge each run of identical tags into one concatenated morpheme.
                tagList.Clear();
                morphemeList.Clear();
                System.String[] morphemes = eojeolSet[i].Morphemes;
                for (int j = 0; j < tags.Length - 1; j++)
                {
                    if (tags[j].Equals(tags[j + 1]))
                    {
                        morphemes[j + 1] = morphemes[j] + morphemes[j + 1];
                    }
                    else
                    {
                        tagList.Add(tags[j]);
                        morphemeList.Add(morphemes[j]);
                    }
                }
                tagList.Add(tags[tags.Length - 1]);
                morphemeList.Add(morphemes[morphemes.Length - 1]);
                eojeolSet[i] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
            }

            System.String key = eojeolSet[i].ToString();
            if (!dupFilterMap.ContainsKey(key))
            {
                dupFilterMap[key] = eojeolSet[i];
            }
        }

        // Keep the original array unless deduplication actually removed something.
        resultSetArray.Add(eojeolSet.Length == dupFilterMap.Count ? eojeolSet : dupFilterMap.Values.ToArray());
    }

    sos.setEojeolSetArray(resultSetArray);
    return(sos);
}
/// <summary> It processes the input plain eojeol by analyzing it or searching the pre-analyzed dictionary.</summary>
/// <param name="plainEojeol">- plain eojeol to analyze</param>
/// <returns> the morphologically analyzed eojeol list</returns>
private Eojeol[] processEojeol(System.String plainEojeol)
{
    System.String analysis = analyzedDic.get_Renamed(plainEojeol);

    eojeolList.Clear();
    if (analysis != null)
    {
        // The eojeol is registered in the pre-analyzed dictionary: candidates are
        // separated by '^', each of the form "morph1/tag1+morph2/tag2+...".
        StringTokenizer st = new StringTokenizer(analysis, "^");
        while (st.HasMoreTokens)
        {
            System.String analyzed = st.NextToken;
            // FIX: "\\+|/" is a regex alternation ('+' or '/'), carried over from
            // Java's String.split. C#'s String.Split treats it as the literal
            // separator "\+|/", which never occurs, so the candidate was left
            // untokenized. Regex.Split restores the intended behavior.
            System.String[] tokens = System.Text.RegularExpressions.Regex.Split(analyzed, "\\+|/");
            System.String[] morphemes = new System.String[tokens.Length / 2];
            System.String[] tags = new System.String[tokens.Length / 2];
            // tokens alternate morpheme, tag, morpheme, tag, ...
            for (int i = 0, j = 0; i < morphemes.Length; i++)
            {
                morphemes[i] = tokens[j++];
                tags[i] = tokens[j++];
            }
            Eojeol eojeol = new Eojeol(morphemes, tags);
            eojeolList.AddLast(eojeol);
        }
    }
    else
    {
        // Not in the dictionary: run the full morphological analysis chart.
        chart.init(plainEojeol);
        chart.analyze();
        chart.getResult();  // NOTE(review): return value discarded — the chart presumably fills eojeolList; confirm
    }
    return eojeolList.ToArray();
}
/// <summary> Adds a new node for the markov model.</summary>
/// <param name="eojeol">- the eojeol to add</param>
/// <param name="wp_tag">- the eojeol tag</param>
/// <param name="prob">- the probability P(w|t)</param>
/// <returns> the index of the new node</returns>
private int new_mnode(Eojeol eojeol, System.String wp_tag, double prob)
{
    // Fill the next free slot of the node pool; back pointer and sibling start
    // unlinked (index 0 serves as the null link in the node chains).
    mn[mn_end].Eojeol = eojeol;
    mn[mn_end].Wp_Tag = wp_tag;
    mn[mn_end].Prob_Wt = prob;
    mn[mn_end].Backptr = 0;
    mn[mn_end].Sibling = 0;
    return mn_end++;
}
/// <summary> Runs viterbi to get the final morphological analysis result which has the highest probability.</summary>
/// <param name="sos">- all the candidates of morphological analysis</param>
/// <returns> the final morphological analysis result which has the highest probability</returns>
private Sentence end_sentence(SetOfSentences sos)
{
    // Append the sentence-final node ("SF") so the lattice has a terminal state.
    int last = new_wp(" ");
    wp[last].MNode = new_mnode(null, "SF", 0);

    // Viterbi forward pass: relax every transition between candidate nodes of
    // adjacent word positions (candidates chained via mn[].Sibling, 0 = null).
    for (int pos = 1; pos < wp_end - 1; pos++)
    {
        int from = wp[pos].MNode;
        while (from != 0)
        {
            int to = wp[pos + 1].MNode;
            while (to != 0)
            {
                update_prob_score(from, to);
                to = mn[to].Sibling;
            }
            from = mn[from].Sibling;
        }
    }

    // Backtrace from the final word position, filling the result from the tail.
    int remaining = sos.length;
    Eojeol[] eojeols = new Eojeol[remaining];
    for (int node = wp[remaining].MNode; node != 0; node = mn[node].Backptr)
    {
        eojeols[--remaining] = mn[node].Eojeol;
    }
    return new Sentence(sos.DocumentID, sos.SentenceID, sos.EndOfDocument, sos.getPlainEojeolArray().ToArray(), eojeols);
}
/// <summary> Computes P(T_i, W_i) of the specified eojeol.</summary>
/// <param name="eojeol">- the eojeol to compute the probability; assumed to have at
/// least one morpheme (getTag(0) is read unconditionally — TODO confirm callers)</param>
/// <returns> P(T_i, W_i) of the specified eojeol</returns>
private double compute_wt(Eojeol eojeol)
{
    // NOTE(review): values are combined with + and - while the inline comments speak
    // of products and quotients — consistent only if the stored values are log
    // probabilities; confirm against the ptt/pwt model data.
    double current = 0.0, tbigram, tunigram, lexicon;
    System.String tag;
    System.String bitag;
    System.String oldtag;

    tag = eojeol.getTag(0);

    /* the probability of P(t1|t0): transition from the leading blank tag "bnk" */
    bitag = "bnk-" + tag;
    double[] prob = null;
    if ((prob = ptt_pos_tf.get_Renamed(bitag)) != null)
    {
        /* current = P(t1|t0) */
        tbigram = prob[0];
    }
    else
    {
        /* unseen bigram is smoothed with the constant: P(t1|t0) = 0.01 */
        tbigram = PCONSTANT;
    }

    /* the probability of P(t1) */
    if ((prob = ptt_pos_tf.get_Renamed(tag)) != null)
    {
        /* current = P(t1) */
        tunigram = prob[0];
    }
    else
    {
        /* current = P(t1) = 0.01 */
        tunigram = PCONSTANT;
    }

    /* the probability of P(w|t) */
    if ((prob = pwt_pos_tf.get_Renamed(eojeol.getMorpheme(0) + "/" + tag)) != null)
    {
        /* current *= P(w|t1) */
        lexicon = prob[0];
    }
    else
    {
        /* current = P(w|t1) = 0.01 */
        lexicon = PCONSTANT;
    }

    /*
     * current = P(w|t1) * P(t1|t0) ~= P(w|t1) * (P(t1|t0))^Lambda1 * (P(t1))^Lambda2 (Lambda1 + Lambda2 = 1)
     */
    // current = lexicon + Lambda1*tbigram + Lambda2*tunigram;
    /*
     * current = P(w|t1)/P(t1) * P(t1|t0)/P(t1)
     */
    // current = lexicon - tunigram + tbigram - tunigram;
    /*
     * current = P(w|t1) * P(t1|t0)
     */
    // current = lexicon + tbigram ;
    /*
     * current = P(w|t1) * P(t1|t0) / P(t1)
     */
    current = lexicon + tbigram - tunigram;

    oldtag = tag;
    // Accumulate the same per-morpheme score for every following morpheme.
    for (int i = 1; i < eojeol.length; i++)
    {
        tag = eojeol.getTag(i);

        /* P(t_i|t_i-1) */
        bitag = oldtag + "-" + tag;
        if ((prob = ptt_pos_tf.get_Renamed(bitag)) != null)
        {
            tbigram = prob[0];
        }
        else
        {
            tbigram = PCONSTANT;
        }

        /* P(w|t) */
        if ((prob = pwt_pos_tf.get_Renamed(eojeol.getMorpheme(i) + "/" + tag)) != null)
        {
            /* current *= P(w|t) */
            lexicon = prob[0];
        }
        else
        {
            lexicon = PCONSTANT;
        }

        /* P(t) */
        if ((prob = ptt_pos_tf.get_Renamed(tag)) != null)
        {
            /* current = P(t) */
            tunigram = prob[0];
        }
        else
        {
            /* current = P(t) = 0.01 */
            tunigram = PCONSTANT;
        }

        // current += lexicon - tunigram + tbigram - tunigram;
        // current += lexicon + tbigram;
        current += lexicon + tbigram - tunigram;

        oldtag = tag;
    }

    /* the blank at the end of eojeol */
    bitag = tag + "-bnk";

    /* P(bnk|t_last) */
    if ((prob = ptt_pos_tf.get_Renamed(bitag)) != null)
    {
        tbigram = prob[0];
    }
    else
    {
        tbigram = PCONSTANT;
    }

    /* P(bnk) */
    if ((prob = ptt_pos_tf.get_Renamed("bnk")) != null)
    {
        tunigram = prob[0];
    }
    else
    {
        tunigram = PCONSTANT;
    }

    /* P(w|bnk) = 1, and ln(1) = 0 */
    // current += 0 - tunigram + tbigram - tunigram;
    // current += 0 + tbigram;
    current += 0 + tbigram - tunigram;

    return current;
}
/// <summary> Computes P(T_i, W_i) of the specified eojeol.</summary>
/// <param name="eojeol">- the eojeol to compute the probability; assumed to have at
/// least one morpheme (getTag(0) is read unconditionally — TODO confirm callers)</param>
/// <returns> P(T_i, W_i) of the specified eojeol</returns>
private double compute_wt(Eojeol eojeol)
{
    // NOTE(review): values are combined with + and - while the inline comments speak
    // of products and quotients — consistent only if the stored values are log
    // probabilities; confirm against the ptt/pwt model data.
    double current = 0.0, tbigram, tunigram, lexicon;
    System.String tag;
    System.String bitag;
    System.String oldtag;

    tag = eojeol.getTag(0);

    /* the probability of P(t1|t0): transition from the leading blank tag "bnk" */
    bitag = "bnk-" + tag;
    double[] prob = null;
    if ((prob = ptt_pos_tf.get_Renamed(bitag)) != null)
    {
        /* current = P(t1|t0) */
        tbigram = prob[0];
    }
    else
    {
        /* unseen bigram is smoothed with the constant: P(t1|t0) = 0.01 */
        tbigram = PCONSTANT;
    }

    /* the probability of P(t1) */
    if ((prob = ptt_pos_tf.get_Renamed(tag)) != null)
    {
        /* current = P(t1) */
        tunigram = prob[0];
    }
    else
    {
        /* current = P(t1) = 0.01 */
        tunigram = PCONSTANT;
    }

    /* the probability of P(w|t) */
    if ((prob = pwt_pos_tf.get_Renamed(eojeol.getMorpheme(0) + "/" + tag)) != null)
    {
        /* current *= P(w|t1) */
        lexicon = prob[0];
    }
    else
    {
        /* current = P(w|t1) = 0.01 */
        lexicon = PCONSTANT;
    }

    /*
     * current = P(w|t1) * P(t1|t0) ~= P(w|t1) * (P(t1|t0))^Lambda1 * (P(t1))^Lambda2 (Lambda1 + Lambda2 = 1)
     */
    // current = lexicon + Lambda1*tbigram + Lambda2*tunigram;
    /*
     * current = P(w|t1)/P(t1) * P(t1|t0)/P(t1)
     */
    // current = lexicon - tunigram + tbigram - tunigram;
    /*
     * current = P(w|t1) * P(t1|t0)
     */
    // current = lexicon + tbigram ;
    /*
     * current = P(w|t1) * P(t1|t0) / P(t1)
     */
    current = lexicon + tbigram - tunigram;

    oldtag = tag;
    // Accumulate the same per-morpheme score for every following morpheme.
    for (int i = 1; i < eojeol.length; i++)
    {
        tag = eojeol.getTag(i);

        /* P(t_i|t_i-1) */
        bitag = oldtag + "-" + tag;
        if ((prob = ptt_pos_tf.get_Renamed(bitag)) != null)
        {
            tbigram = prob[0];
        }
        else
        {
            tbigram = PCONSTANT;
        }

        /* P(w|t) */
        if ((prob = pwt_pos_tf.get_Renamed(eojeol.getMorpheme(i) + "/" + tag)) != null)
        {
            /* current *= P(w|t) */
            lexicon = prob[0];
        }
        else
        {
            lexicon = PCONSTANT;
        }

        /* P(t) */
        if ((prob = ptt_pos_tf.get_Renamed(tag)) != null)
        {
            /* current = P(t) */
            tunigram = prob[0];
        }
        else
        {
            /* current = P(t) = 0.01 */
            tunigram = PCONSTANT;
        }

        // current += lexicon - tunigram + tbigram - tunigram;
        // current += lexicon + tbigram;
        current += lexicon + tbigram - tunigram;

        oldtag = tag;
    }

    /* the blank at the end of eojeol */
    bitag = tag + "-bnk";

    /* P(bnk|t_last) */
    if ((prob = ptt_pos_tf.get_Renamed(bitag)) != null)
    {
        tbigram = prob[0];
    }
    else
    {
        tbigram = PCONSTANT;
    }

    /* P(bnk) */
    if ((prob = ptt_pos_tf.get_Renamed("bnk")) != null)
    {
        tunigram = prob[0];
    }
    else
    {
        tunigram = PCONSTANT;
    }

    /* P(w|bnk) = 1, and ln(1) = 0 */
    // current += 0 - tunigram + tbigram - tunigram;
    // current += 0 + tbigram;
    current += 0 + tbigram - tunigram;

    return(current);
}
/// <summary> It does post processing of morphological analysis to deal with some exceptions.</summary>
/// <param name="sos">- the result of morphological analysis</param>
/// <returns> the result of morphological analysis with post processing</returns>
public virtual SetOfSentences doPostProcessing(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();
    IEnumerator<Eojeol[]> iter = eojeolSetArray.GetEnumerator();
    while (iter.MoveNext())
    {
        Eojeol[] eojeolSet = iter.Current;
        // prevMorph holds the jamo-decomposed form of the previously seen morpheme
        // (the stem the current ending attaches to).
        System.String prevMorph = "";
        for (int i = 0; i < eojeolSet.Length; i++)
        {
            Eojeol eojeol = eojeolSet[i];
            System.String[] morphemes = eojeol.Morphemes;
            System.String[] tags = eojeol.Tags;
            for (int j = 0; j < eojeol.length; j++)
            {
                // Decompose the morpheme into a jamo (consonant/vowel) triple string.
                System.String tri = Code.toTripleString(morphemes[j]);
                if (tags[j].StartsWith("e"))
                {
                    // Only ending tags ("e...") are adjusted here.
                    int prevLen = prevMorph.Length;
                    if (tri.StartsWith(A_))
                    {
                        /* vowel harmony: ending-initial '어' -> '아' after a positive-vowel stem */
                        if (prevLen >= 4 && prevMorph[prevLen - 1] == EU[1] && !isXEU(prevMorph[prevLen - 2]) && ((Code.isJungseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 3])) || (Code.isJongseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 4]))))
                        {
                            morphemes[j] = Code.toString(AR.ToCharArray());
                        }
                        else if (prevLen >= 3 && prevMorph[prevLen - 1] == DOB[2] && (prevMorph.Substring(prevLen - 3).Equals(DOB) == false || prevMorph.Substring(prevLen - 3).Equals(GOB) == false))
                        {
                            /* for 'ㅂ' irregular: deliberately no replacement */
                            // NOTE(review): the OR of two negated Equals is always true unless
                            // DOB == GOB — the author likely intended && ("neither DOB nor GOB").
                            // Left untouched; verify against the original Java source.
                        }
                        else if (prevLen >= 2 && prevMorph.Substring(prevLen - 2).Equals(HA))
                        {
                            // Stems ending in HA are excluded from the replacement below.
                        }
                        else if (prevLen >= 2 && ((Code.isJungseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 1])) || (Code.isJongseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 2]))))
                        {
                            // Positive vowel in the stem's last syllable, with or
                            // without a final consonant: apply '어' -> '아'.
                            morphemes[j] = Code.toString(AR.ToCharArray());
                        }
                    }
                    else if (tri.StartsWith(EU.Substring(0, (2) - (0))) || tri.StartsWith(SU.Substring(0, (4) - (0))) || tri.StartsWith(NU.Substring(0, (4) - (0))))
                    {
                        /* elision of ending-initial '으', '스', '느' */
                        // Drop the leading syllable when the stem ends in a vowel or
                        // in the jongseong 'ㄹ' (U+11AF).
                        if (prevLen >= 2 && (Code.isJungseong(prevMorph[prevLen - 1]) || prevMorph[prevLen - 1] == 0x11AF))
                        {
                            morphemes[j] = Code.toString(tri.Substring(2).ToCharArray());
                        }
                    }
                }
                // The (possibly rewritten) morpheme becomes the stem context for the next one.
                prevMorph = Code.toTripleString(morphemes[j]);
            }
        }
    }
    return(sos);
}