Exemplo n.º 1
0
        /// <summary> Worker loop: takes plain sentences from the input queue, passes each one through the
        /// plain-text processor and adds every non-null result to the output queue. When the thread is
        /// interrupted, the processor is shut down and the loop ends.
        /// </summary>
        public override void Run()
        {
            try
            {
                for (;;)
                {
                    // Process the next incoming sentence.
                    PlainSentence incoming = in_Renamed.Take();
                    PlainSentence produced = plainTextProcessor.doProcess(incoming);
                    if (produced != null)
                    {
                        out_Renamed.Add(produced);
                    }

                    // Keep calling the processor without new input for as long as it reports remaining data.
                    while (plainTextProcessor.hasRemainingData())
                    {
                        produced = plainTextProcessor.doProcess(null);
                        if (produced != null)
                        {
                            out_Renamed.Add(produced);
                        }
                    }

                    // Forward whatever flush() hands back, if anything.
                    produced = plainTextProcessor.flush();
                    if (produced != null)
                    {
                        out_Renamed.Add(produced);
                    }
                }
            }
            catch (System.Threading.ThreadInterruptedException)
            {
                plainTextProcessor.shutdown();
            }
        }
        /// <summary> Breaks up informal eojeols that are long and contain many repetitions of one character.
        /// Such irregular patterns would otherwise slow down the analysis, so blanks are inserted to
        /// separate them. The sentence text of <paramref name="ps"/> is rewritten in place.
        /// </summary>
        /// <param name="ps">the plain sentence whose text is filtered</param>
        /// <returns> the same <paramref name="ps"/> instance, with its sentence replaced by the filtered text
        /// </returns>
        public virtual PlainSentence doProcess(PlainSentence ps)
        {
            System.Text.StringBuilder filtered = new System.Text.StringBuilder();
            StringTokenizer tokens = new StringTokenizer(ps.Sentence, " \t");

            while (tokens.HasMoreTokens)
            {
                string eojeol = tokens.NextToken;

                // Short eojeols are copied through unchanged.
                if (eojeol.Length <= REPEAT_CHAR_ALLOWED)
                {
                    filtered.Append(eojeol);
                    filtered.Append(' ');
                    continue;
                }

                char previous = eojeol[0];
                int run = 0;

                filtered.Append(previous);

                for (int pos = 1; pos < eojeol.Length; pos++)
                {
                    char current = eojeol[pos];

                    if (current != previous)
                    {
                        // A different character follows; a preceding '.' is set off by a blank.
                        if (previous == '.')
                        {
                            filtered.Append(' ');
                        }
                        previous = current;
                        run = 0;
                    }
                    else if (run == REPEAT_CHAR_ALLOWED - 1)
                    {
                        // The allowed number of repetitions is reached: start a new chunk.
                        filtered.Append(' ');
                        run = 0;
                    }
                    else
                    {
                        run++;
                    }

                    filtered.Append(current);
                }

                filtered.Append(' ');
            }

            ps.Sentence = filtered.ToString();
            return ps;
        }
        /// <summary> Filters informal sentences in which an eojeol is long and repeats the same character
        /// many times. To keep such irregular patterns from degrading analysis performance, blanks are
        /// inserted into those eojeols to separate them. The text of <paramref name="ps"/> is updated in place.
        /// </summary>
        /// <param name="ps">the plain sentence to filter</param>
        /// <returns> <paramref name="ps"/> itself, carrying the filtered sentence text
        /// </returns>
        public virtual PlainSentence doProcess(PlainSentence ps)
        {
            StringTokenizer tokenizer = new StringTokenizer(ps.Sentence, " \t");
            System.Text.StringBuilder result = new System.Text.StringBuilder();

            while (tokenizer.HasMoreTokens)
            {
                string token = tokenizer.NextToken;

                if (token.Length > REPEAT_CHAR_ALLOWED)
                {
                    char[] chars = token.ToCharArray();
                    char last = chars[0];
                    int repeats = 0;

                    result.Append(last);

                    for (int k = 1; k < chars.Length; k++)
                    {
                        char c = chars[k];
                        bool sameAsLast = (c == last);

                        // A blank goes in front of the character either when the repetition limit
                        // has been reached, or when a '.' is followed by a different character.
                        bool needsBlank = sameAsLast
                            ? repeats == REPEAT_CHAR_ALLOWED - 1
                            : last == '.';

                        if (needsBlank)
                        {
                            result.Append(' ');
                        }
                        result.Append(c);

                        // The counter only grows while the same character continues without a break.
                        repeats = (sameAsLast && !needsBlank) ? repeats + 1 : 0;
                        last = c;
                    }
                }
                else
                {
                    result.Append(token);
                }

                result.Append(' ');
            }

            ps.Sentence = result.ToString();
            return ps;
        }
Exemplo n.º 4
0
        /// <summary> Worker loop: takes plain sentences from the input queue, runs morphological analysis on
        /// each one and adds every non-null result to the output queue. When the thread is interrupted,
        /// the analyzer is shut down and the loop ends.
        /// </summary>
        public override void Run()
        {
            try
            {
                for (;;)
                {
                    PlainSentence input = in_Renamed.Take();
                    SetOfSentences analyzed = ma.morphAnalyze(input);

                    if (analyzed != null)
                    {
                        out_Renamed.Add(analyzed);
                    }
                }
            }
            catch (System.Threading.ThreadInterruptedException)
            {
                ma.shutdown();
            }
        }
Exemplo n.º 5
0
        /// <summary> Analyzes the specified plain sentence: every blank- or tab-separated eojeol is analyzed
        /// with <c>processEojeol</c>, and the collected results are passed through post-processing.</summary>
        /// <param name="ps">the plain sentence to analyze</param>
        /// <returns> all the possible morphological analysis results, as returned by the post-processor
        /// </returns>
        public virtual SetOfSentences morphAnalyze(PlainSentence ps)
        {
            StringTokenizer tokenizer = new StringTokenizer(ps.Sentence, " \t");

            // Pre-size both lists with the tokenizer's token count.
            int capacity = tokenizer.Count;
            List<string> surfaceForms = new List<string>(capacity);
            List<Eojeol[]> candidates = new List<Eojeol[]>(capacity);

            while (tokenizer.HasMoreTokens)
            {
                string token = tokenizer.NextToken;

                surfaceForms.Add(token);
                candidates.Add(processEojeol(token));
            }

            SetOfSentences analyzed = new SetOfSentences(ps.DocumentID, ps.SentenceID, ps.EndOfDocument, surfaceForms, candidates);

            return postProc.doPostProcessing(analyzed);
        }
Exemplo n.º 6
0
        /// <summary> Recognizes the end of a sentence and returns the first sentence found.
        /// If eojeols are left over from a previous call (<c>bufEojeols</c> is not null), scanning resumes
        /// from them and <paramref name="ps"/> is not read; otherwise <c>ps.Sentence</c> is split into eojeols.
        /// A sentence ends at '!' or '?', or at a '.' that is not skipped by the ellipsis, abbreviation or
        /// number checks in the body. Text scanned without reaching an end of sentence is kept in
        /// <c>bufRes</c> and prepended to the next sentence returned.</summary>
        /// <param name="ps">- the plain sentence which can consist of several sentences
        /// </param>
        /// <returns> the first sentence recognized, or null when there is nothing to scan or no end of
        /// sentence was found
        /// </returns>
        public virtual PlainSentence doProcess(PlainSentence ps)
        {
            System.String[] eojeols = null;
            System.String res = null;
            bool isFirstEojeol = true;
            bool isEOS = false;
            int i = 0;
            int j = 0;

            if (bufEojeols != null)
            {
                // resume from the eojeols buffered by a previous call; the buffer is consumed here
                eojeols = bufEojeols;
                i = bufEojeolsIdx;

                bufEojeols = null;
                bufEojeolsIdx = 0;
            }
            else
            {
                if (ps == null)
                {
                    return null;
                }

                // a new document restarts the sentence numbering
                if (documentID != ps.DocumentID)
                {
                    documentID = ps.DocumentID;
                    sentenceID = 0;
                }

                System.String str = null;
                if ((str = ps.Sentence) == null)
                {
                    return null;
                }
                // NOTE(review): "\\s" looks like a regex pattern carried over from Java; whether this splits
                // on whitespace depends on which Split overload/extension is bound here - confirm
                eojeols = str.Split("\\s");

                endOfDocument = ps.EndOfDocument;
            }

            for (; isEOS == false && i < eojeols.Length; i++)
            {
                // NOTE(review): Matches is not a System.String member; presumably a project extension that
                // performs a whole-string regex match - confirm
                if (!eojeols[i].Matches(".*(\\.|\\!|\\?).*"))
                {
                    // the eojeol doesn't have '.', '!', '?'
                    if (isFirstEojeol)
                    {
                        res = eojeols[i];
                        isFirstEojeol = false;
                    }
                    else
                    {
                        res += (" " + eojeols[i]);
                    }
                }
                else
                {
                    // the eojeol has '.', '!', '?'
                    char[] ca = eojeols[i].ToCharArray();

                    for (j = 0; isEOS == false && j < ca.Length; j++)
                    {
                        // each 'continue' below resumes the character loop (j++), skipping this '.'
                        switch (ca[j])
                        {

                            case '.':
                                if (j == 1)
                                {
                                    // ellipsis
                                    // NOTE(review): this skips any '.' at index 1 regardless of its neighbours - confirm intent
                                    continue;
                                }
                                if (j > 0)
                                {
                                    // abbreviation
                                    if (System.Char.IsLower(ca[j - 1]) || System.Char.IsUpper(ca[j - 1]))
                                    {
                                        continue;
                                    }
                                }
                                if (j < ca.Length - 1)
                                {
                                    // number
                                    if (System.Char.IsDigit(ca[j + 1]))
                                    {
                                        continue;
                                    }
                                }
                                isEOS = true;
                                break;

                            case '!':
                                isEOS = true;
                                break;

                            case '?':
                                isEOS = true;
                                break;
                        }

                        if (isEOS)
                        {
                            // append the text before the end mark, a blank, then the end mark itself
                            if (isFirstEojeol)
                            {
                                res = eojeols[i].Substring(0, (j) - (0)) + " " + ca[j];
                                isFirstEojeol = false;
                            }
                            else
                            {
                                res += (" " + eojeols[i].Substring(0, (j) - (0)) + " " + ca[j]);
                            }

                            // a sequence of symbols such as '...', '?!!'
                            while (j < ca.Length - 1)
                            {
                                if (isSym(ca[j + 1]))
                                {
                                    j++;
                                    res += ca[j];
                                }
                                else
                                {
                                    break;
                                }
                            }
                        }
                    }
                    if (isEOS == false)
                    {
                        // every '.' in this eojeol was skipped, so it is appended whole
                        if (isFirstEojeol)
                        {
                            res = eojeols[i];
                            isFirstEojeol = false;
                        }
                        else
                        {
                            res += (" " + eojeols[i]);
                        }
                    }
                }
            }

            // both for-loops increment once more after isEOS is set; step back so that, when isEOS is true,
            // i is the eojeol where the sentence ended and j is the last character consumed into it
            i--;
            j--;

            if (isEOS)
            {
                // the remaining part of an eojeol after the end of sentence is stored in the buffer
                if (j + 1 < eojeols[i].Length)
                {
                    eojeols[i] = eojeols[i].Substring(j + 1);
                    bufEojeols = eojeols;
                    bufEojeolsIdx = i;
                    hasRemainingData_Renamed_Field = true;
                }
                else
                {
                    if (i == eojeols.Length - 1)
                    {
                        // all eojeols were processed
                        hasRemainingData_Renamed_Field = false;
                    }
                    else
                    {
                        // if there were some eojeols not processed, they were stored in the buffer
                        bufEojeols = eojeols;
                        bufEojeolsIdx = i + 1;
                        hasRemainingData_Renamed_Field = true;
                    }
                }

                // the end-of-document flag is set only when nothing is left buffered
                if (bufRes == null)
                {
                    return new PlainSentence(documentID, sentenceID++, !hasRemainingData_Renamed_Field && endOfDocument, res);
                }
                else
                {
                    // prepend the text carried over from a call that found no end of sentence
                    res = bufRes + " " + res;
                    bufRes = null;
                    return new PlainSentence(documentID, sentenceID++, !hasRemainingData_Renamed_Field && endOfDocument, res);
                }
            }
            else
            {
                // no end of sentence found: keep the scanned text for the next call
                if (res != null && res.Length > 0)
                {
                    bufRes = res;
                }
                hasRemainingData_Renamed_Field = false;
                return null;
            }
        }
Exemplo n.º 7
0
        /// <summary> Runs morphological analysis on a plain sentence. The sentence is tokenized on blanks and
        /// tabs, each eojeol is analyzed by <c>processEojeol</c>, and the combined result is handed to the
        /// post-processor.</summary>
        /// <param name="ps">the plain sentence to analyze</param>
        /// <returns> all the possible morphological analysis results, as returned by the post-processor
        /// </returns>
        public virtual SetOfSentences morphAnalyze(PlainSentence ps)
        {
            StringTokenizer words = new StringTokenizer(ps.Sentence, " \t");

            // The token count is used as the initial capacity of both lists.
            int wordCount = words.Count;
            List<string> plainForms = new List<string>(wordCount);
            List<Eojeol[]> analyses = new List<Eojeol[]>(wordCount);

            while (words.HasMoreTokens)
            {
                string current = words.NextToken;

                plainForms.Add(current);
                analyses.Add(processEojeol(current));
            }

            SetOfSentences raw = new SetOfSentences(ps.DocumentID, ps.SentenceID, ps.EndOfDocument, plainForms, analyses);
            SetOfSentences processed = postProc.doPostProcessing(raw);

            return processed;
        }
Exemplo n.º 8
0
        /// <summary> Recognizes the end of a sentence and returns the first sentence found.
        /// If eojeols are left over from a previous call (<c>bufEojeols</c> is not null), scanning resumes
        /// from them and <paramref name="ps"/> is not read; otherwise <c>ps.Sentence</c> is split into eojeols.
        /// A sentence ends at '!' or '?', or at a '.' that is not skipped by the ellipsis, abbreviation or
        /// number checks in the body. Text scanned without reaching an end of sentence is kept in
        /// <c>bufRes</c> and prepended to the next sentence returned.</summary>
        /// <param name="ps">- the plain sentence which can consist of several sentences
        /// </param>
        /// <returns> the first sentence recognized, or null when there is nothing to scan or no end of
        /// sentence was found
        /// </returns>
        public virtual PlainSentence doProcess(PlainSentence ps)
        {
            System.String[] eojeols       = null;
            System.String   res           = null;
            bool            isFirstEojeol = true;
            bool            isEOS         = false;
            int             i             = 0;
            int             j             = 0;

            if (bufEojeols != null)
            {
                // resume from the eojeols buffered by a previous call; the buffer is consumed here
                eojeols = bufEojeols;
                i       = bufEojeolsIdx;

                bufEojeols    = null;
                bufEojeolsIdx = 0;
            }
            else
            {
                if (ps == null)
                {
                    return(null);
                }

                // a new document restarts the sentence numbering
                if (documentID != ps.DocumentID)
                {
                    documentID = ps.DocumentID;
                    sentenceID = 0;
                }

                System.String str = null;
                if ((str = ps.Sentence) == null)
                {
                    return(null);
                }
                // NOTE(review): "\\s" looks like a regex pattern carried over from Java; whether this splits
                // on whitespace depends on which Split overload/extension is bound here - confirm
                eojeols = str.Split("\\s");

                endOfDocument = ps.EndOfDocument;
            }

            for (; isEOS == false && i < eojeols.Length; i++)
            {
                // NOTE(review): Matches is not a System.String member; presumably a project extension that
                // performs a whole-string regex match - confirm
                if (!eojeols[i].Matches(".*(\\.|\\!|\\?).*"))
                {
                    // the eojeol doesn't have '.', '!', '?'
                    if (isFirstEojeol)
                    {
                        res           = eojeols[i];
                        isFirstEojeol = false;
                    }
                    else
                    {
                        res += (" " + eojeols[i]);
                    }
                }
                else
                {
                    // the eojeol has '.', '!', '?'
                    char[] ca = eojeols[i].ToCharArray();

                    for (j = 0; isEOS == false && j < ca.Length; j++)
                    {
                        // each 'continue' below resumes the character loop (j++), skipping this '.'
                        switch (ca[j])
                        {
                        case '.':
                            if (j == 1)
                            {
                                // ellipsis
                                // NOTE(review): this skips any '.' at index 1 regardless of its neighbours - confirm intent
                                continue;
                            }
                            if (j > 0)
                            {
                                // abbreviation
                                if (System.Char.IsLower(ca[j - 1]) || System.Char.IsUpper(ca[j - 1]))
                                {
                                    continue;
                                }
                            }
                            if (j < ca.Length - 1)
                            {
                                // number
                                if (System.Char.IsDigit(ca[j + 1]))
                                {
                                    continue;
                                }
                            }
                            isEOS = true;
                            break;

                        case '!':
                            isEOS = true;
                            break;

                        case '?':
                            isEOS = true;
                            break;
                        }

                        if (isEOS)
                        {
                            // append the text before the end mark, a blank, then the end mark itself
                            if (isFirstEojeol)
                            {
                                res           = eojeols[i].Substring(0, (j) - (0)) + " " + ca[j];
                                isFirstEojeol = false;
                            }
                            else
                            {
                                res += (" " + eojeols[i].Substring(0, (j) - (0)) + " " + ca[j]);
                            }

                            // a sequence of symbols such as '...', '?!!'
                            while (j < ca.Length - 1)
                            {
                                if (isSym(ca[j + 1]))
                                {
                                    j++;
                                    res += ca[j];
                                }
                                else
                                {
                                    break;
                                }
                            }
                        }
                    }
                    if (isEOS == false)
                    {
                        // every '.' in this eojeol was skipped, so it is appended whole
                        if (isFirstEojeol)
                        {
                            res           = eojeols[i];
                            isFirstEojeol = false;
                        }
                        else
                        {
                            res += (" " + eojeols[i]);
                        }
                    }
                }
            }

            // both for-loops increment once more after isEOS is set; step back so that, when isEOS is true,
            // i is the eojeol where the sentence ended and j is the last character consumed into it
            i--;
            j--;

            if (isEOS)
            {
                // the remaining part of an eojeol after the end of sentence is stored in the buffer
                if (j + 1 < eojeols[i].Length)
                {
                    eojeols[i]    = eojeols[i].Substring(j + 1);
                    bufEojeols    = eojeols;
                    bufEojeolsIdx = i;
                    hasRemainingData_Renamed_Field = true;
                }
                else
                {
                    if (i == eojeols.Length - 1)
                    {
                        // all eojeols were processed
                        hasRemainingData_Renamed_Field = false;
                    }
                    else
                    {
                        // if there were some eojeols not processed, they were stored in the buffer
                        bufEojeols    = eojeols;
                        bufEojeolsIdx = i + 1;
                        hasRemainingData_Renamed_Field = true;
                    }
                }

                // the end-of-document flag is set only when nothing is left buffered
                if (bufRes == null)
                {
                    return(new PlainSentence(documentID, sentenceID++, !hasRemainingData_Renamed_Field && endOfDocument, res));
                }
                else
                {
                    // prepend the text carried over from a call that found no end of sentence
                    res    = bufRes + " " + res;
                    bufRes = null;
                    return(new PlainSentence(documentID, sentenceID++, !hasRemainingData_Renamed_Field && endOfDocument, res));
                }
            }
            else
            {
                // no end of sentence found: keep the scanned text for the next call
                if (res != null && res.Length > 0)
                {
                    bufRes = res;
                }
                hasRemainingData_Renamed_Field = false;
                return(null);
            }
        }