/// <summary>
/// Closes the lattice with a final "SF" node, scores every pair of adjacent
/// candidates, and collects the eojeols reached by following back-pointers.
/// </summary>
/// <param name="sos">all the candidates of morphological analysis</param>
/// <returns>the sentence assembled from the eojeols found on the back-pointer chain</returns>
private Sentence end_sentence(SetOfSentences sos)
{
    /* Creates the last node */
    int finalPhrase = new_wp(" ");
    wp[finalPhrase].MNode = new_mnode(null, "SF", 0);

    /* Runs viterbi: every candidate of a word phrase is scored against every candidate of the next one */
    for (int phrase = 1; phrase < wp_end - 1; phrase++)
    {
        int left = wp[phrase].MNode;
        while (left != 0)
        {
            int right = wp[phrase + 1].MNode;
            while (right != 0)
            {
                update_prob_score(left, right);
                right = mn[right].Sibling;
            }
            left = mn[left].Sibling;
        }
    }

    /* Fills the result from the back, starting at the word phrase indexed by sos.length */
    int slot = sos.length;
    Eojeol[] chosen = new Eojeol[slot];
    int node = wp[slot].MNode;
    while (node != 0)
    {
        slot--;
        chosen[slot] = mn[node].Eojeol;
        node = mn[node].Backptr;
    }

    return new Sentence(sos.DocumentID, sos.SentenceID, sos.EndOfDocument, sos.getPlainEojeolArray().ToArray(), chosen);
}
/// <summary>
/// Builds one word phrase per plain eojeol and one node per analysis candidate,
/// then delegates to <c>end_sentence</c> to pick the final result.
/// </summary>
/// <param name="sos">the plain eojeols together with their candidate analyses</param>
/// <returns>the sentence returned by <c>end_sentence</c></returns>
public virtual Sentence tagPOS(SetOfSentences sos)
{
    List<string> plainEojeolArray = sos.getPlainEojeolArray();
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();

    // initialization
    reset();

    // Walk both lists in lock-step; stop as soon as either one runs out.
    IEnumerator<string> plainIter = plainEojeolArray.GetEnumerator();
    IEnumerator<Eojeol[]> setIter = eojeolSetArray.GetEnumerator();
    while (setIter.MoveNext() && plainIter.MoveNext())
    {
        Eojeol[] candidates = setIter.Current;
        int phrase = new_wp(plainIter.Current);

        // Index of the most recently created node, used to chain siblings.
        int tail = 0;
        for (int c = 0; c < candidates.Length; c++)
        {
            Eojeol candidate = candidates[c];
            string phraseTag = PhraseTag.getPhraseTag(candidate.Tags);
            double weight = compute_wt(candidate);
            int node = new_mnode(candidate, phraseTag, weight);

            if (c > 0)
            {
                mn[tail].Sibling = node;
            }
            else
            {
                // The first candidate becomes the head node of the word phrase.
                wp[phrase].MNode = node;
            }
            tail = node;
        }
    }

    // gets the final result by running viterbi
    return end_sentence(sos);
}
/// <summary>
/// Expands every analysis candidate that contains an "unk" tag: for each such
/// tag a copy of the candidate with the tag set to "nqq" is appended, and the
/// tag in the scanned candidate is rewritten to "ncn".
/// </summary>
/// <param name="sos">the analysis candidates; its eojeol set list is updated in place</param>
/// <returns>the same <paramref name="sos"/> instance</returns>
public virtual SetOfSentences doProcess(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();

    // A List gives O(1) indexed access; the scan below indexes on every step.
    List<Eojeol> expanded = new List<Eojeol>();

    for (int i = 0; i < eojeolSetArray.Count; i++)
    {
        Eojeol[] eojeolSet = eojeolSetArray[i];
        expanded.Clear();
        expanded.AddRange(eojeolSet);

        int unkCount = 0;

        // expanded.Count grows while scanning, so appended copies are scanned too
        // and any further "unk" tags they still carry are expanded as well.
        for (int j = 0; j < expanded.Count; j++)
        {
            Eojeol eojeol = expanded[j];
            string[] tags = eojeol.Tags;
            string[] morphemes = eojeol.Morphemes;

            for (int k = 0; k < tags.Length; k++)
            {
                if (!tags[k].Equals("unk"))
                {
                    continue;
                }

                // Snapshot the "nqq" reading first, then turn this candidate into the "ncn" reading.
                tags[k] = "nqq";
                expanded.Add(new Eojeol((string[])morphemes.Clone(), (string[])tags.Clone()));
                tags[k] = "ncn";
                unkCount++;
            }
        }

        if (unkCount > 0)
        {
            // The expanded list is longer than the original set here, so a new array replaces it.
            eojeolSetArray[i] = expanded.ToArray();
        }
    }

    return sos;
}
/// <summary>
/// Expands every analysis candidate that contains an "unk" tag: for each such
/// tag a copy of the candidate with the tag set to "nqq" is appended, and the
/// tag in the scanned candidate is rewritten to "ncn".
/// </summary>
/// <param name="sos">the analysis candidates; its eojeol set list is updated in place</param>
/// <returns>the same <paramref name="sos"/> instance</returns>
public virtual SetOfSentences doProcess(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();

    // A List gives O(1) indexed access; the scan below indexes on every step.
    List<Eojeol> expanded = new List<Eojeol>();

    for (int i = 0; i < eojeolSetArray.Count; i++)
    {
        Eojeol[] eojeolSet = eojeolSetArray[i];
        expanded.Clear();
        expanded.AddRange(eojeolSet);

        int unkCount = 0;

        // expanded.Count grows while scanning, so appended copies are scanned too
        // and any further "unk" tags they still carry are expanded as well.
        for (int j = 0; j < expanded.Count; j++)
        {
            Eojeol eojeol = expanded[j];
            string[] tags = eojeol.Tags;
            string[] morphemes = eojeol.Morphemes;

            for (int k = 0; k < tags.Length; k++)
            {
                if (!tags[k].Equals("unk"))
                {
                    continue;
                }

                // Snapshot the "nqq" reading first, then turn this candidate into the "ncn" reading.
                tags[k] = "nqq";
                expanded.Add(new Eojeol((string[])morphemes.Clone(), (string[])tags.Clone()));
                tags[k] = "ncn";
                unkCount++;
            }
        }

        if (unkCount > 0)
        {
            // The expanded list is longer than the original set here, so a new array replaces it.
            eojeolSetArray[i] = expanded.ToArray();
        }
    }

    return sos;
}
/// <summary>
/// Worker loop: repeatedly takes an item from <c>in_Renamed</c>, passes it to
/// <c>morphProcessor.doProcess</c>, and adds every non-null result to
/// <c>out_Renamed</c>. When a <c>ThreadInterruptedException</c> is raised the
/// processor is shut down and the method returns.
/// </summary>
override public void Run()
{
    try
    {
        for (;;)
        {
            SetOfSentences input = in_Renamed.Take();
            SetOfSentences processed = morphProcessor.doProcess(input);
            if (processed == null)
            {
                // Nothing to forward for this item.
                continue;
            }
            out_Renamed.Add(processed);
        }
    }
    catch (System.Threading.ThreadInterruptedException)
    {
        // Interruption is the signal to stop: release the processor and leave.
        morphProcessor.shutdown();
    }
}
/// <summary>
/// Worker loop: repeatedly takes an item from <c>in_Renamed</c>, passes it to
/// <c>tagger.tagPOS</c>, and adds every non-null sentence to
/// <c>out_Renamed</c>. When a <c>ThreadInterruptedException</c> is raised the
/// tagger is shut down and the method returns.
/// </summary>
override public void Run()
{
    try
    {
        for (;;)
        {
            SetOfSentences candidates = in_Renamed.Take();
            Sentence tagged = tagger.tagPOS(candidates);
            if (tagged == null)
            {
                // Nothing to forward for this item.
                continue;
            }
            out_Renamed.Add(tagged);
        }
    }
    catch (System.Threading.ThreadInterruptedException)
    {
        // Interruption is the signal to stop: release the tagger and leave.
        tagger.shutdown();
    }
}
/// <summary>
/// Worker loop: repeatedly takes a plain sentence from <c>in_Renamed</c>,
/// passes it to <c>ma.morphAnalyze</c>, and adds every non-null result to
/// <c>out_Renamed</c>. When a <c>ThreadInterruptedException</c> is raised the
/// analyzer is shut down and the method returns.
/// </summary>
override public void Run()
{
    try
    {
        for (;;)
        {
            PlainSentence plain = in_Renamed.Take();
            SetOfSentences analyzed = ma.morphAnalyze(plain);
            if (analyzed == null)
            {
                // Nothing to forward for this item.
                continue;
            }
            out_Renamed.Add(analyzed);
        }
    }
    catch (System.Threading.ThreadInterruptedException)
    {
        // Interruption is the signal to stop: release the analyzer and leave.
        ma.shutdown();
    }
}
/// <summary>
/// Splits the plain sentence on spaces and tabs, analyzes each token with
/// <c>processEojeol</c>, and post-processes the collected candidates.
/// </summary>
/// <param name="ps">the plain sentence to analyze</param>
/// <returns>all the possible morphological analysis results, after post-processing</returns>
public virtual SetOfSentences morphAnalyze(PlainSentence ps)
{
    StringTokenizer tokenizer = new StringTokenizer(ps.Sentence, " \t");

    // Pre-size both lists with the tokenizer's reported count.
    int capacity = tokenizer.Count;
    List<string> plainEojeolArray = new List<string>(capacity);
    List<Eojeol[]> eojeolSetArray = new List<Eojeol[]>(capacity);

    while (tokenizer.HasMoreTokens)
    {
        string token = tokenizer.NextToken;
        plainEojeolArray.Add(token);
        eojeolSetArray.Add(processEojeol(token));
    }

    SetOfSentences analyzed = new SetOfSentences(
        ps.DocumentID,
        ps.SentenceID,
        ps.EndOfDocument,
        plainEojeolArray,
        eojeolSetArray);

    return postProc.doPostProcessing(analyzed);
}
/// <summary>
/// Maps every tag through <c>TagMapper.getKaistTagOnLevel</c> at <c>TAG_LEVEL</c>
/// (the 69 KAIST tags become the simplified 22-tag set), merges neighbouring
/// morphemes that end up with the same tag, and drops duplicate candidates.
/// </summary>
/// <param name="sos">the result of morphological analysis where each eojeol has more than one analysis result</param>
/// <returns>the same <paramref name="sos"/> instance, carrying the simplified result</returns>
public virtual SetOfSentences doProcess(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();
    List<Eojeol[]> resultSetArray = new List<Eojeol[]>();

    for (int pos = 0, total = eojeolSetArray.Count; pos < total; pos++)
    {
        Eojeol[] candidates = eojeolSetArray[pos];
        dupFilterMap.Clear();

        for (int c = 0; c < candidates.Length; c++)
        {
            string[] tags = candidates[c].Tags;

            // Rewrite each tag in place and note whether two neighbours now share a tag.
            bool hasAdjacentRepeat = false;
            string previous = "";
            for (int t = 0; t < tags.Length; t++)
            {
                string mapped = TagMapper.getKaistTagOnLevel(tags[t], TAG_LEVEL);
                tags[t] = mapped;
                hasAdjacentRepeat |= mapped.Equals(previous);
                previous = mapped;
            }

            if (hasAdjacentRepeat)
            {
                tagList.Clear();
                morphemeList.Clear();
                string[] morphemes = candidates[c].Morphemes;
                int lastTag = tags.Length - 1;

                for (int t = 0; t < lastTag; t++)
                {
                    int next = t + 1;
                    if (!tags[t].Equals(tags[next]))
                    {
                        tagList.Add(tags[t]);
                        morphemeList.Add(morphemes[t]);
                        continue;
                    }
                    // Same tag as the neighbour: carry this morpheme's text forward.
                    morphemes[next] = morphemes[t] + morphemes[next];
                }

                tagList.Add(tags[lastTag]);
                morphemeList.Add(morphemes[morphemes.Length - 1]);
                candidates[c] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
            }

            // Keep only the first candidate for each string form.
            string key = candidates[c].ToString();
            if (!dupFilterMap.ContainsKey(key))
            {
                dupFilterMap[key] = candidates[c];
            }
        }

        if (candidates.Length == dupFilterMap.Count)
        {
            // No duplicates were filtered out: keep the array as it is.
            resultSetArray.Add(candidates);
        }
        else
        {
            resultSetArray.Add(dupFilterMap.Values.ToArray());
        }
    }

    sos.setEojeolSetArray(resultSetArray);
    return sos;
}
/// <summary>
/// Maps every tag through <c>TagMapper.getKaistTagOnLevel</c> at <c>TAG_LEVEL</c>
/// (the 69 KAIST tags become the simplified 22-tag set), merges neighbouring
/// morphemes that end up with the same tag, and drops duplicate candidates.
/// </summary>
/// <param name="sos">the result of morphological analysis where each eojeol has more than one analysis result</param>
/// <returns>the same <paramref name="sos"/> instance, carrying the simplified result</returns>
public virtual SetOfSentences doProcess(SetOfSentences sos)
{
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();
    List<Eojeol[]> resultSetArray = new List<Eojeol[]>();

    for (int pos = 0, total = eojeolSetArray.Count; pos < total; pos++)
    {
        Eojeol[] candidates = eojeolSetArray[pos];
        dupFilterMap.Clear();

        for (int c = 0; c < candidates.Length; c++)
        {
            string[] tags = candidates[c].Tags;

            // Rewrite each tag in place and note whether two neighbours now share a tag.
            bool hasAdjacentRepeat = false;
            string previous = "";
            for (int t = 0; t < tags.Length; t++)
            {
                string mapped = TagMapper.getKaistTagOnLevel(tags[t], TAG_LEVEL);
                tags[t] = mapped;
                hasAdjacentRepeat |= mapped.Equals(previous);
                previous = mapped;
            }

            if (hasAdjacentRepeat)
            {
                tagList.Clear();
                morphemeList.Clear();
                string[] morphemes = candidates[c].Morphemes;
                int lastTag = tags.Length - 1;

                for (int t = 0; t < lastTag; t++)
                {
                    int next = t + 1;
                    if (!tags[t].Equals(tags[next]))
                    {
                        tagList.Add(tags[t]);
                        morphemeList.Add(morphemes[t]);
                        continue;
                    }
                    // Same tag as the neighbour: carry this morpheme's text forward.
                    morphemes[next] = morphemes[t] + morphemes[next];
                }

                tagList.Add(tags[lastTag]);
                morphemeList.Add(morphemes[morphemes.Length - 1]);
                candidates[c] = new Eojeol(morphemeList.ToArray(), tagList.ToArray());
            }

            // Keep only the first candidate for each string form.
            string key = candidates[c].ToString();
            if (!dupFilterMap.ContainsKey(key))
            {
                dupFilterMap[key] = candidates[c];
            }
        }

        if (candidates.Length == dupFilterMap.Count)
        {
            // No duplicates were filtered out: keep the array as it is.
            resultSetArray.Add(candidates);
        }
        else
        {
            resultSetArray.Add(dupFilterMap.Values.ToArray());
        }
    }

    sos.setEojeolSetArray(resultSetArray);
    return sos;
}
/// <summary>
/// Splits the plain sentence on spaces and tabs, analyzes each token with
/// <c>processEojeol</c>, and post-processes the collected candidates.
/// </summary>
/// <param name="ps">the plain sentence to analyze</param>
/// <returns>all the possible morphological analysis results, after post-processing</returns>
public virtual SetOfSentences morphAnalyze(PlainSentence ps)
{
    StringTokenizer tokenizer = new StringTokenizer(ps.Sentence, " \t");

    // Pre-size both lists with the tokenizer's reported count.
    int capacity = tokenizer.Count;
    List<string> plainEojeolArray = new List<string>(capacity);
    List<Eojeol[]> eojeolSetArray = new List<Eojeol[]>(capacity);

    while (tokenizer.HasMoreTokens)
    {
        string token = tokenizer.NextToken;
        plainEojeolArray.Add(token);
        eojeolSetArray.Add(processEojeol(token));
    }

    SetOfSentences analyzed = new SetOfSentences(
        ps.DocumentID,
        ps.SentenceID,
        ps.EndOfDocument,
        plainEojeolArray,
        eojeolSetArray);

    return postProc.doPostProcessing(analyzed);
}
/// <summary>
/// Closes the lattice with a final "SF" node, scores every pair of adjacent
/// candidates, and collects the eojeols reached by following back-pointers.
/// </summary>
/// <param name="sos">all the candidates of morphological analysis</param>
/// <returns>the sentence assembled from the eojeols found on the back-pointer chain</returns>
private Sentence end_sentence(SetOfSentences sos)
{
    /* Creates the last node */
    int finalPhrase = new_wp(" ");
    wp[finalPhrase].MNode = new_mnode(null, "SF", 0);

    /* Runs viterbi: every candidate of a word phrase is scored against every candidate of the next one */
    for (int phrase = 1; phrase < wp_end - 1; phrase++)
    {
        int left = wp[phrase].MNode;
        while (left != 0)
        {
            int right = wp[phrase + 1].MNode;
            while (right != 0)
            {
                update_prob_score(left, right);
                right = mn[right].Sibling;
            }
            left = mn[left].Sibling;
        }
    }

    /* Fills the result from the back, starting at the word phrase indexed by sos.length */
    int slot = sos.length;
    Eojeol[] chosen = new Eojeol[slot];
    int node = wp[slot].MNode;
    while (node != 0)
    {
        slot--;
        chosen[slot] = mn[node].Eojeol;
        node = mn[node].Backptr;
    }

    return new Sentence(sos.DocumentID, sos.SentenceID, sos.EndOfDocument, sos.getPlainEojeolArray().ToArray(), chosen);
}
/// <summary>
/// Builds one word phrase per plain eojeol and one node per analysis candidate,
/// then delegates to <c>end_sentence</c> to pick the final result.
/// </summary>
/// <param name="sos">the plain eojeols together with their candidate analyses</param>
/// <returns>the sentence returned by <c>end_sentence</c></returns>
public virtual Sentence tagPOS(SetOfSentences sos)
{
    List<string> plainEojeolArray = sos.getPlainEojeolArray();
    List<Eojeol[]> eojeolSetArray = sos.getEojeolSetArray();

    // initialization
    reset();

    // Walk both lists in lock-step; stop as soon as either one runs out.
    IEnumerator<string> plainIter = plainEojeolArray.GetEnumerator();
    IEnumerator<Eojeol[]> setIter = eojeolSetArray.GetEnumerator();
    while (setIter.MoveNext() && plainIter.MoveNext())
    {
        Eojeol[] candidates = setIter.Current;
        int phrase = new_wp(plainIter.Current);

        // Index of the most recently created node, used to chain siblings.
        int tail = 0;
        for (int c = 0; c < candidates.Length; c++)
        {
            Eojeol candidate = candidates[c];
            string phraseTag = PhraseTag.getPhraseTag(candidate.Tags);
            double weight = compute_wt(candidate);
            int node = new_mnode(candidate, phraseTag, weight);

            if (c > 0)
            {
                mn[tail].Sibling = node;
            }
            else
            {
                // The first candidate becomes the head node of the word phrase.
                wp[phrase].MNode = node;
            }
            tail = node;
        }
    }

    // gets the final result by running viterbi
    return end_sentence(sos);
}
/// <summary>
/// Post-processes the morphological analysis in place: for morphemes whose tag
/// starts with "e", the ending may be replaced by <c>AR</c> or have its first two
/// jamo dropped, depending on how the preceding morpheme ends.
/// </summary>
/// <param name="sos">the result of morphological analysis; morpheme arrays are modified in place</param>
/// <returns>the same <paramref name="sos"/> instance</returns>
public virtual SetOfSentences doPostProcessing(SetOfSentences sos)
{
    foreach (Eojeol[] eojeolSet in sos.getEojeolSetArray())
    {
        // NOTE(review): prevMorph is reset per eojeol set, not per candidate, so the last
        // morpheme of one candidate is seen as "previous" by the next candidate - confirm intended.
        System.String prevMorph = "";

        for (int i = 0; i < eojeolSet.Length; i++)
        {
            Eojeol eojeol = eojeolSet[i];
            System.String[] morphemes = eojeol.Morphemes;
            System.String[] tags = eojeol.Tags;

            for (int j = 0; j < eojeol.length; j++)
            {
                System.String tri = Code.toTripleString(morphemes[j]);

                // Ordinal comparison: these strings hold conjoining jamo, and the
                // culture-sensitive default of StartsWith must not be applied to them.
                if (tags[j].StartsWith("e", System.StringComparison.Ordinal))
                {
                    int prevLen = prevMorph.Length;

                    if (tri.StartsWith(A_, System.StringComparison.Ordinal))
                    {
                        /* eo (어) -> a (아) */
                        if (prevLen >= 4
                            && prevMorph[prevLen - 1] == EU[1]
                            && !isXEU(prevMorph[prevLen - 2])
                            && ((Code.isJungseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 3]))
                                || (Code.isJongseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 4]))))
                        {
                            morphemes[j] = Code.toString(AR.ToCharArray());
                        }
                        else if (prevLen >= 3
                            && prevMorph[prevLen - 1] == DOB[2]
                            && !prevMorph.Substring(prevLen - 3).Equals(DOB)
                            && !prevMorph.Substring(prevLen - 3).Equals(GOB))
                        {
                            /* for 'ㅂ' (bieup) irregular: leave the ending untouched.
                             * The two exclusions are combined with AND; with OR the test was
                             * always true, so stems ending in DOB or GOB could never reach
                             * the vowel-based rule below. */
                        }
                        else if (prevLen >= 2 && prevMorph.Substring(prevLen - 2).Equals(HA))
                        {
                            /* stems ending in HA: leave the ending untouched */
                        }
                        else if (prevLen >= 2
                            && ((Code.isJungseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 1]))
                                || (Code.isJongseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 2]))))
                        {
                            // final consonant or not
                            morphemes[j] = Code.toString(AR.ToCharArray());
                        }
                    }
                    else if (tri.StartsWith(EU.Substring(0, 2), System.StringComparison.Ordinal)
                        || tri.StartsWith(SU.Substring(0, 4), System.StringComparison.Ordinal)
                        || tri.StartsWith(NU.Substring(0, 4), System.StringComparison.Ordinal))
                    {
                        /* elision of 'eu' (으), 'seu' (스), 'neu' (느) */
                        // 0x11AF is U+11AF HANGUL JONGSEONG RIEUL.
                        if (prevLen >= 2
                            && (Code.isJungseong(prevMorph[prevLen - 1]) || prevMorph[prevLen - 1] == 0x11AF))
                        {
                            morphemes[j] = Code.toString(tri.Substring(2).ToCharArray());
                        }
                    }
                }

                // Remember the (possibly rewritten) morpheme for the next iteration.
                prevMorph = Code.toTripleString(morphemes[j]);
            }
        }
    }

    return sos;
}
/// <summary>
/// Post-processes the morphological analysis in place: for morphemes whose tag
/// starts with "e", the ending may be replaced by <c>AR</c> or have its first two
/// jamo dropped, depending on how the preceding morpheme ends.
/// </summary>
/// <param name="sos">the result of morphological analysis; morpheme arrays are modified in place</param>
/// <returns>the same <paramref name="sos"/> instance</returns>
public virtual SetOfSentences doPostProcessing(SetOfSentences sos)
{
    foreach (Eojeol[] eojeolSet in sos.getEojeolSetArray())
    {
        // NOTE(review): prevMorph is reset per eojeol set, not per candidate, so the last
        // morpheme of one candidate is seen as "previous" by the next candidate - confirm intended.
        System.String prevMorph = "";

        for (int i = 0; i < eojeolSet.Length; i++)
        {
            Eojeol eojeol = eojeolSet[i];
            System.String[] morphemes = eojeol.Morphemes;
            System.String[] tags = eojeol.Tags;

            for (int j = 0; j < eojeol.length; j++)
            {
                System.String tri = Code.toTripleString(morphemes[j]);

                // Ordinal comparison: these strings hold conjoining jamo, and the
                // culture-sensitive default of StartsWith must not be applied to them.
                if (tags[j].StartsWith("e", System.StringComparison.Ordinal))
                {
                    int prevLen = prevMorph.Length;

                    if (tri.StartsWith(A_, System.StringComparison.Ordinal))
                    {
                        /* eo (어) -> a (아) */
                        if (prevLen >= 4
                            && prevMorph[prevLen - 1] == EU[1]
                            && !isXEU(prevMorph[prevLen - 2])
                            && ((Code.isJungseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 3]))
                                || (Code.isJongseong(prevMorph[prevLen - 3]) && isPV(prevMorph[prevLen - 4]))))
                        {
                            morphemes[j] = Code.toString(AR.ToCharArray());
                        }
                        else if (prevLen >= 3
                            && prevMorph[prevLen - 1] == DOB[2]
                            && !prevMorph.Substring(prevLen - 3).Equals(DOB)
                            && !prevMorph.Substring(prevLen - 3).Equals(GOB))
                        {
                            /* for 'ㅂ' (bieup) irregular: leave the ending untouched.
                             * The two exclusions are combined with AND; with OR the test was
                             * always true, so stems ending in DOB or GOB could never reach
                             * the vowel-based rule below. */
                        }
                        else if (prevLen >= 2 && prevMorph.Substring(prevLen - 2).Equals(HA))
                        {
                            /* stems ending in HA: leave the ending untouched */
                        }
                        else if (prevLen >= 2
                            && ((Code.isJungseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 1]))
                                || (Code.isJongseong(prevMorph[prevLen - 1]) && isPV(prevMorph[prevLen - 2]))))
                        {
                            // final consonant or not
                            morphemes[j] = Code.toString(AR.ToCharArray());
                        }
                    }
                    else if (tri.StartsWith(EU.Substring(0, 2), System.StringComparison.Ordinal)
                        || tri.StartsWith(SU.Substring(0, 4), System.StringComparison.Ordinal)
                        || tri.StartsWith(NU.Substring(0, 4), System.StringComparison.Ordinal))
                    {
                        /* elision of 'eu' (으), 'seu' (스), 'neu' (느) */
                        // 0x11AF is U+11AF HANGUL JONGSEONG RIEUL.
                        if (prevLen >= 2
                            && (Code.isJungseong(prevMorph[prevLen - 1]) || prevMorph[prevLen - 1] == 0x11AF))
                        {
                            morphemes[j] = Code.toString(tri.Substring(2).ToCharArray());
                        }
                    }
                }

                // Remember the (possibly rewritten) morpheme for the next iteration.
                prevMorph = Code.toTripleString(morphemes[j]);
            }
        }
    }

    return sos;
}