/// <summary>
/// Worker loop: repeatedly takes a plain sentence from <c>in_Renamed</c>, runs it through
/// <c>plainTextProcessor</c>, and adds every non-null result to <c>out_Renamed</c>.
/// For each input it first processes the sentence itself, then drains the processor while
/// it reports remaining data, and finally flushes it.
/// The loop only ends when a <see cref="System.Threading.ThreadInterruptedException"/> is
/// raised, at which point the processor is shut down.
/// </summary>
public override void Run()
{
    try
    {
        for (;;)
        {
            PlainSentence incoming = in_Renamed.Take();

            // Result of processing the freshly taken sentence.
            PlainSentence produced = plainTextProcessor.doProcess(incoming);
            if (produced != null)
            {
                out_Renamed.Add(produced);
            }

            // The processor is called with null until it reports nothing left.
            while (plainTextProcessor.hasRemainingData())
            {
                produced = plainTextProcessor.doProcess(null);
                if (produced != null)
                {
                    out_Renamed.Add(produced);
                }
            }

            // Whatever the processor still holds after draining.
            produced = plainTextProcessor.flush();
            if (produced != null)
            {
                out_Renamed.Add(produced);
            }
        }
    }
    catch (System.Threading.ThreadInterruptedException)
    {
        // Interruption is the only exit path of the loop.
        plainTextProcessor.shutdown();
    }
}
/// <summary>
/// Recognizes informal sentences in which an eojeol is quite long and some characters are
/// repeated many times. To prevent those unimportant irregular patterns from degrading
/// analysis performance, it inserts blanks into such eojeols to separate them.
/// </summary>
/// <remarks>
/// The sentence is tokenized on space and tab. Tokens no longer than
/// <c>REPEAT_CHAR_ALLOWED</c> are copied unchanged. In longer tokens a blank is inserted
/// after every <c>REPEAT_CHAR_ALLOWED</c> consecutive occurrences of the same character,
/// and after a run of '.' when a different character follows. Every token is followed by
/// a single blank, so the rewritten sentence ends with a blank.
/// </remarks>
/// <param name="ps">The sentence to filter; its <c>Sentence</c> is rewritten in place.</param>
/// <returns>
/// The same <paramref name="ps"/> instance with the rewritten text; <c>null</c> when
/// <paramref name="ps"/> is <c>null</c>. A sentence whose text is <c>null</c> is returned untouched.
/// </returns>
public virtual PlainSentence doProcess(PlainSentence ps)
{
    // Guard against missing input instead of failing with a NullReferenceException.
    if (ps == null)
    {
        return null;
    }

    System.String sentence = ps.Sentence;
    if (sentence == null)
    {
        // Nothing to filter; hand the sentence on unchanged.
        return ps;
    }

    System.Text.StringBuilder buf = new System.Text.StringBuilder();
    StringTokenizer st = new StringTokenizer(sentence, " \t");

    while (st.HasMoreTokens)
    {
        System.String word = st.NextToken;

        if (word.Length > REPEAT_CHAR_ALLOWED)
        {
            // Number of times checkChar has repeated since the last inserted blank.
            int repeatCnt = 0;
            char checkChar = word[0];
            buf.Append(checkChar);

            for (int i = 1; i < word.Length; i++)
            {
                char current = word[i];

                if (checkChar == current)
                {
                    if (repeatCnt == REPEAT_CHAR_ALLOWED - 1)
                    {
                        // Allowed run length reached: break the run with a blank.
                        buf.Append(' ');
                        repeatCnt = 0;
                    }
                    else
                    {
                        repeatCnt++;
                    }
                    buf.Append(current);
                }
                else
                {
                    if (checkChar == '.')
                    {
                        // Detach whatever follows a period (or a run of periods).
                        buf.Append(' ');
                    }
                    buf.Append(current);
                    checkChar = current;
                    repeatCnt = 0;
                }
            }
        }
        else
        {
            buf.Append(word);
        }

        buf.Append(' ');
    }

    ps.Sentence = buf.ToString();
    return ps;
}
/// <summary>
/// Recognizes informal sentences in which an eojeol is quite long and some characters are
/// repeated many times. To prevent those unimportant irregular patterns from degrading
/// analysis performance, it inserts blanks into such eojeols to separate them.
/// </summary>
/// <remarks>
/// The sentence is tokenized on space and tab. Tokens no longer than
/// <c>REPEAT_CHAR_ALLOWED</c> are copied unchanged. In longer tokens a blank is inserted
/// after every <c>REPEAT_CHAR_ALLOWED</c> consecutive occurrences of the same character,
/// and after a run of '.' when a different character follows. Every token is followed by
/// a single blank, so the rewritten sentence ends with a blank.
/// </remarks>
/// <param name="ps">The sentence to filter; its <c>Sentence</c> is rewritten in place.</param>
/// <returns>
/// The same <paramref name="ps"/> instance with the rewritten text; <c>null</c> when
/// <paramref name="ps"/> is <c>null</c>. A sentence whose text is <c>null</c> is returned untouched.
/// </returns>
public virtual PlainSentence doProcess(PlainSentence ps)
{
    // Guard against missing input instead of failing with a NullReferenceException.
    if (ps == null)
    {
        return null;
    }

    System.String sentence = ps.Sentence;
    if (sentence == null)
    {
        // Nothing to filter; hand the sentence on unchanged.
        return ps;
    }

    System.Text.StringBuilder buf = new System.Text.StringBuilder();
    StringTokenizer st = new StringTokenizer(sentence, " \t");

    while (st.HasMoreTokens)
    {
        System.String word = st.NextToken;

        if (word.Length > REPEAT_CHAR_ALLOWED)
        {
            // Number of times checkChar has repeated since the last inserted blank.
            int repeatCnt = 0;
            char checkChar = word[0];
            buf.Append(checkChar);

            for (int i = 1; i < word.Length; i++)
            {
                char current = word[i];

                if (checkChar == current)
                {
                    if (repeatCnt == REPEAT_CHAR_ALLOWED - 1)
                    {
                        // Allowed run length reached: break the run with a blank.
                        buf.Append(' ');
                        repeatCnt = 0;
                    }
                    else
                    {
                        repeatCnt++;
                    }
                    buf.Append(current);
                }
                else
                {
                    if (checkChar == '.')
                    {
                        // Detach whatever follows a period (or a run of periods).
                        buf.Append(' ');
                    }
                    buf.Append(current);
                    checkChar = current;
                    repeatCnt = 0;
                }
            }
        }
        else
        {
            buf.Append(word);
        }

        buf.Append(' ');
    }

    ps.Sentence = buf.ToString();
    return ps;
}
/// <summary>
/// Worker loop: repeatedly takes a plain sentence from <c>in_Renamed</c>, passes it to
/// <c>ma.morphAnalyze</c>, and adds each non-null result to <c>out_Renamed</c>.
/// The loop only ends when a <see cref="System.Threading.ThreadInterruptedException"/> is
/// raised, at which point the analyzer is shut down.
/// </summary>
public override void Run()
{
    try
    {
        for (;;)
        {
            PlainSentence sentence = in_Renamed.Take();
            SetOfSentences analyzed = ma.morphAnalyze(sentence);

            // A null result means the analyzer produced nothing for this input.
            if (analyzed == null)
            {
                continue;
            }

            out_Renamed.Add(analyzed);
        }
    }
    catch (System.Threading.ThreadInterruptedException)
    {
        // Interruption is the only exit path of the loop.
        ma.shutdown();
    }
}
/// <summary>
/// Analyzes the specified plain sentence and returns all the possible analysis results.
/// </summary>
/// <remarks>
/// The sentence text is tokenized on space and tab. Each token is recorded as a plain eojeol
/// and analyzed with <c>processEojeol</c>. The collected results are wrapped in a
/// <c>SetOfSentences</c> carrying the document ID, sentence ID and end-of-document flag of
/// <paramref name="ps"/>, then passed through <c>postProc.doPostProcessing</c>.
/// </remarks>
/// <param name="ps">The plain sentence to analyze.</param>
/// <returns>All the possible morphological analysis results, after post-processing.</returns>
public virtual SetOfSentences morphAnalyze(PlainSentence ps)
{
    StringTokenizer tokenizer = new StringTokenizer(ps.Sentence, " \t");

    // The token count is read up front to pre-size both lists.
    int capacity = tokenizer.Count;
    List<String> surfaceForms = new List<String>(capacity);
    List<Eojeol[]> candidates = new List<Eojeol[]>(capacity);

    while (tokenizer.HasMoreTokens)
    {
        System.String token = tokenizer.NextToken;
        surfaceForms.Add(token);
        candidates.Add(processEojeol(token));
    }

    SetOfSentences analyzed = new SetOfSentences(
        ps.DocumentID,
        ps.SentenceID,
        ps.EndOfDocument,
        surfaceForms,
        candidates);

    return postProc.doPostProcessing(analyzed);
}
/// <summary>
/// Recognizes the end of each sentence and returns the first sentence found.
/// </summary>
/// <remarks>
/// State carried between calls:
/// <list type="bullet">
/// <item><c>bufEojeols</c> / <c>bufEojeolsIdx</c>: eojeols not yet consumed after a sentence end;
/// when set, they are processed instead of <paramref name="ps"/>.</item>
/// <item><c>bufRes</c>: text collected without reaching a sentence end; it is prepended to the
/// next recognized sentence.</item>
/// <item><c>hasRemainingData_Renamed_Field</c>: set when eojeols remain buffered after this call.</item>
/// </list>
/// A sentence ends at '!' or '?', or at a '.' that is not at index 1 of its eojeol, is not
/// preceded by an upper- or lower-case letter, and is not followed by a digit.
/// </remarks>
/// <param name="ps">The plain sentence, which can consist of several sentences. May be
/// <c>null</c> when the call is meant to continue with buffered eojeols.</param>
/// <returns>The first sentence recognized, or <c>null</c> when there is no input or no end of
/// sentence was found.</returns>
public virtual PlainSentence doProcess(PlainSentence ps)
{
    System.String[] eojeols = null;
    System.String res = null;
    bool isFirstEojeol = true;
    bool isEOS = false;
    int i = 0;
    int j = 0;

    // Characters that may terminate a sentence.
    char[] terminators = { '.', '!', '?' };

    if (bufEojeols != null)
    {
        // Resume with the eojeols left over from the previous call.
        eojeols = bufEojeols;
        i = bufEojeolsIdx;
        bufEojeols = null;
        bufEojeolsIdx = 0;
    }
    else
    {
        if (ps == null)
        {
            return null;
        }

        if (documentID != ps.DocumentID)
        {
            // A new document starts: restart sentence numbering.
            documentID = ps.DocumentID;
            sentenceID = 0;
        }

        System.String str = ps.Sentence;
        if (str == null)
        {
            return null;
        }

        // "\\s" is a regular expression; String.Split would treat it as literal text.
        eojeols = System.Text.RegularExpressions.Regex.Split(str, "\\s");
        endOfDocument = ps.EndOfDocument;
    }

    for (; isEOS == false && i < eojeols.Length; i++)
    {
        if (eojeols[i].IndexOfAny(terminators) < 0)
        {
            // the eojeol doesn't have '.', '!', '?'
            if (isFirstEojeol)
            {
                res = eojeols[i];
                isFirstEojeol = false;
            }
            else
            {
                res += (" " + eojeols[i]);
            }
        }
        else
        {
            // the eojeol has '.', '!', '?'
            char[] ca = eojeols[i].ToCharArray();

            for (j = 0; isEOS == false && j < ca.Length; j++)
            {
                // Note: 'continue' below moves on to the next character of the eojeol.
                switch (ca[j])
                {
                    case '.':
                        if (j == 1)
                        {
                            // ellipsis
                            continue;
                        }
                        if (j > 0)
                        {
                            // abbreviation
                            if (System.Char.IsLower(ca[j - 1]) || System.Char.IsUpper(ca[j - 1]))
                            {
                                continue;
                            }
                        }
                        if (j < ca.Length - 1)
                        {
                            // number
                            if (System.Char.IsDigit(ca[j + 1]))
                            {
                                continue;
                            }
                        }
                        isEOS = true;
                        break;

                    case '!':
                    case '?':
                        isEOS = true;
                        break;
                }

                if (isEOS)
                {
                    // Emit the text before the terminator, a blank, then the terminator itself.
                    if (isFirstEojeol)
                    {
                        res = eojeols[i].Substring(0, j) + " " + ca[j];
                        isFirstEojeol = false;
                    }
                    else
                    {
                        res += (" " + eojeols[i].Substring(0, j) + " " + ca[j]);
                    }

                    // a sequence of symbols such as '...', '?!!'
                    while (j < ca.Length - 1)
                    {
                        if (isSym(ca[j + 1]))
                        {
                            j++;
                            res += ca[j];
                        }
                        else
                        {
                            break;
                        }
                    }
                }
            }

            if (isEOS == false)
            {
                // Punctuation was present but none of it ended the sentence.
                if (isFirstEojeol)
                {
                    res = eojeols[i];
                    isFirstEojeol = false;
                }
                else
                {
                    res += (" " + eojeols[i]);
                }
            }
        }
    }

    // Both loops incremented once more before exiting; step back to the last consumed position.
    i--;
    j--;

    if (isEOS)
    {
        if (j + 1 < eojeols[i].Length)
        {
            // the remaining part of an eojeol after the end of sentence is stored in the buffer
            eojeols[i] = eojeols[i].Substring(j + 1);
            bufEojeols = eojeols;
            bufEojeolsIdx = i;
            hasRemainingData_Renamed_Field = true;
        }
        else if (i == eojeols.Length - 1)
        {
            // all eojeols were processed
            hasRemainingData_Renamed_Field = false;
        }
        else
        {
            // if there were some eojeols not processed, they were stored in the buffer
            bufEojeols = eojeols;
            bufEojeolsIdx = i + 1;
            hasRemainingData_Renamed_Field = true;
        }

        if (bufRes != null)
        {
            // Prepend text carried over from an earlier call that found no sentence end.
            res = bufRes + " " + res;
            bufRes = null;
        }

        return new PlainSentence(documentID, sentenceID++, !hasRemainingData_Renamed_Field && endOfDocument, res);
    }

    // No sentence end found: keep the collected text for a later call.
    if (res != null && res.Length > 0)
    {
        bufRes = res;
    }
    hasRemainingData_Renamed_Field = false;
    return null;
}
/// <summary>
/// Analyzes the specified plain sentence and returns all the possible analysis results.
/// </summary>
/// <remarks>
/// The sentence text is tokenized on space and tab. Each token is recorded as a plain eojeol
/// and analyzed with <c>processEojeol</c>. The collected results are wrapped in a
/// <c>SetOfSentences</c> carrying the document ID, sentence ID and end-of-document flag of
/// <paramref name="ps"/>, then passed through <c>postProc.doPostProcessing</c>.
/// </remarks>
/// <param name="ps">The plain sentence to analyze.</param>
/// <returns>All the possible morphological analysis results, after post-processing.</returns>
public virtual SetOfSentences morphAnalyze(PlainSentence ps)
{
    StringTokenizer tokenizer = new StringTokenizer(ps.Sentence, " \t");

    // The token count is read up front to pre-size both lists.
    int capacity = tokenizer.Count;
    List<String> surfaceForms = new List<String>(capacity);
    List<Eojeol[]> candidates = new List<Eojeol[]>(capacity);

    while (tokenizer.HasMoreTokens)
    {
        System.String token = tokenizer.NextToken;
        surfaceForms.Add(token);
        candidates.Add(processEojeol(token));
    }

    SetOfSentences analyzed = new SetOfSentences(
        ps.DocumentID,
        ps.SentenceID,
        ps.EndOfDocument,
        surfaceForms,
        candidates);

    return postProc.doPostProcessing(analyzed);
}
/// <summary>
/// Recognizes the end of each sentence and returns the first sentence found.
/// </summary>
/// <remarks>
/// State carried between calls:
/// <list type="bullet">
/// <item><c>bufEojeols</c> / <c>bufEojeolsIdx</c>: eojeols not yet consumed after a sentence end;
/// when set, they are processed instead of <paramref name="ps"/>.</item>
/// <item><c>bufRes</c>: text collected without reaching a sentence end; it is prepended to the
/// next recognized sentence.</item>
/// <item><c>hasRemainingData_Renamed_Field</c>: set when eojeols remain buffered after this call.</item>
/// </list>
/// A sentence ends at '!' or '?', or at a '.' that is not at index 1 of its eojeol, is not
/// preceded by an upper- or lower-case letter, and is not followed by a digit.
/// </remarks>
/// <param name="ps">The plain sentence, which can consist of several sentences. May be
/// <c>null</c> when the call is meant to continue with buffered eojeols.</param>
/// <returns>The first sentence recognized, or <c>null</c> when there is no input or no end of
/// sentence was found.</returns>
public virtual PlainSentence doProcess(PlainSentence ps)
{
    System.String[] eojeols = null;
    System.String res = null;
    bool isFirstEojeol = true;
    bool isEOS = false;
    int i = 0;
    int j = 0;

    // Characters that may terminate a sentence.
    char[] terminators = { '.', '!', '?' };

    if (bufEojeols != null)
    {
        // Resume with the eojeols left over from the previous call.
        eojeols = bufEojeols;
        i = bufEojeolsIdx;
        bufEojeols = null;
        bufEojeolsIdx = 0;
    }
    else
    {
        if (ps == null)
        {
            return null;
        }

        if (documentID != ps.DocumentID)
        {
            // A new document starts: restart sentence numbering.
            documentID = ps.DocumentID;
            sentenceID = 0;
        }

        System.String str = ps.Sentence;
        if (str == null)
        {
            return null;
        }

        // "\\s" is a regular expression; String.Split would treat it as literal text.
        eojeols = System.Text.RegularExpressions.Regex.Split(str, "\\s");
        endOfDocument = ps.EndOfDocument;
    }

    for (; isEOS == false && i < eojeols.Length; i++)
    {
        if (eojeols[i].IndexOfAny(terminators) < 0)
        {
            // the eojeol doesn't have '.', '!', '?'
            if (isFirstEojeol)
            {
                res = eojeols[i];
                isFirstEojeol = false;
            }
            else
            {
                res += (" " + eojeols[i]);
            }
        }
        else
        {
            // the eojeol has '.', '!', '?'
            char[] ca = eojeols[i].ToCharArray();

            for (j = 0; isEOS == false && j < ca.Length; j++)
            {
                // Note: 'continue' below moves on to the next character of the eojeol.
                switch (ca[j])
                {
                    case '.':
                        if (j == 1)
                        {
                            // ellipsis
                            continue;
                        }
                        if (j > 0)
                        {
                            // abbreviation
                            if (System.Char.IsLower(ca[j - 1]) || System.Char.IsUpper(ca[j - 1]))
                            {
                                continue;
                            }
                        }
                        if (j < ca.Length - 1)
                        {
                            // number
                            if (System.Char.IsDigit(ca[j + 1]))
                            {
                                continue;
                            }
                        }
                        isEOS = true;
                        break;

                    case '!':
                    case '?':
                        isEOS = true;
                        break;
                }

                if (isEOS)
                {
                    // Emit the text before the terminator, a blank, then the terminator itself.
                    if (isFirstEojeol)
                    {
                        res = eojeols[i].Substring(0, j) + " " + ca[j];
                        isFirstEojeol = false;
                    }
                    else
                    {
                        res += (" " + eojeols[i].Substring(0, j) + " " + ca[j]);
                    }

                    // a sequence of symbols such as '...', '?!!'
                    while (j < ca.Length - 1)
                    {
                        if (isSym(ca[j + 1]))
                        {
                            j++;
                            res += ca[j];
                        }
                        else
                        {
                            break;
                        }
                    }
                }
            }

            if (isEOS == false)
            {
                // Punctuation was present but none of it ended the sentence.
                if (isFirstEojeol)
                {
                    res = eojeols[i];
                    isFirstEojeol = false;
                }
                else
                {
                    res += (" " + eojeols[i]);
                }
            }
        }
    }

    // Both loops incremented once more before exiting; step back to the last consumed position.
    i--;
    j--;

    if (isEOS)
    {
        if (j + 1 < eojeols[i].Length)
        {
            // the remaining part of an eojeol after the end of sentence is stored in the buffer
            eojeols[i] = eojeols[i].Substring(j + 1);
            bufEojeols = eojeols;
            bufEojeolsIdx = i;
            hasRemainingData_Renamed_Field = true;
        }
        else if (i == eojeols.Length - 1)
        {
            // all eojeols were processed
            hasRemainingData_Renamed_Field = false;
        }
        else
        {
            // if there were some eojeols not processed, they were stored in the buffer
            bufEojeols = eojeols;
            bufEojeolsIdx = i + 1;
            hasRemainingData_Renamed_Field = true;
        }

        if (bufRes != null)
        {
            // Prepend text carried over from an earlier call that found no sentence end.
            res = bufRes + " " + res;
            bufRes = null;
        }

        return new PlainSentence(documentID, sentenceID++, !hasRemainingData_Renamed_Field && endOfDocument, res);
    }

    // No sentence end found: keep the collected text for a later call.
    if (res != null && res.Length > 0)
    {
        bufRes = res;
    }
    hasRemainingData_Renamed_Field = false;
    return null;
}