Exemple #1
0
        /// <summary>
        /// 预分词
        /// </summary>
        /// <param name="str">要分词的句子</param>
        /// <returns>预分词后的字符串输出</returns>
        private List <String> PreSegment(String str)
        {
            ArrayList initSeg = new ArrayList();


            if (!CRegex.GetSingleMatchStrings(str, PATTERNS, true, ref initSeg))
            {
                return(new List <String>());
            }

            List <String> retWords = new List <String>();

            int i = 0;

            _ExtractWords.MatchDirection = MatchDirection;

            while (i < initSeg.Count)
            {
                String word = (String)initSeg[i];
                if (word == "")
                {
                    word = " ";
                }

                if (i < initSeg.Count - 1)
                {
                    bool mergeOk = false;
                    if (((word[0] >= '0' && word[0] <= '9') || (word[0] >= '0' && word[0] <= '9')) &&
                        ((word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9') ||
                         (word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9'))
                        )
                    {
                        //合并浮点数
                        word    = MergeFloat(initSeg, i, ref i);
                        mergeOk = true;
                    }
                    else if ((word[0] >= 'a' && word[0] <= 'z') ||
                             (word[0] >= 'A' && word[0] <= 'Z')
                             )
                    {
                        //合并成英文专业名词
                        String specialEnglish = MergeEnglishSpecialWord(_ExtractWords, initSeg, i, ref i);

                        if (specialEnglish != null)
                        {
                            InsertWordToArray(specialEnglish, retWords);
                            continue;
                        }

                        //合并邮件地址
                        if ((String)initSeg[i + 1] != "")
                        {
                            if (((String)initSeg[i + 1])[0] == '@')
                            {
                                word    = MergeEmail(initSeg, i, ref i);
                                mergeOk = true;
                            }
                        }
                    }

                    if (mergeOk)
                    {
                        InsertWordToArray(word, retWords);
                        continue;
                    }
                }


                if (word[0] < 0x4e00 || word[0] > 0x9fa5)
                {
                    //英文或符号,直接加入
                    InsertWordToArray(word, retWords);
                }
                else
                {
                    List <T_WordInfo> words = _ExtractWords.ExtractFullTextMaxMatch(word);
                    int  lastPos            = 0;
                    bool lstIsName          = false; //前一个词是人名

                    foreach (T_WordInfo wordInfo in words)
                    {
                        if (lastPos < wordInfo.Position)
                        {
                            /*
                             *                          String unMatchWord = word.Substring(lastPos, wordInfo.Position - lastPos);
                             *
                             *                          InsertWordToArray(unMatchWord, retWords);
                             */
                            //中间有未匹配词,将单个字逐个加入
                            for (int j = lastPos; j < wordInfo.Position; j++)
                            {
                                InsertWordToArray(word[j].ToString(), retWords);
                            }
                        }


                        lastPos = wordInfo.Position + wordInfo.Word.Length;

                        //统计中文姓名的后缀
                        if (AutoStudy && lstIsName)
                        {
                            T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
                            if ((wordDict.Pos & (int)T_POS.POS_A_NR) == 0)
                            {
                                _MatchNameRule.AddBefore(wordInfo.Word);
                            }

                            lstIsName = false;
                        }

                        //统计中文姓名的前缀
                        //如总统,主席等
                        if ((((T_DictStruct)wordInfo.Tag).Pos & (int)T_POS.POS_A_NR) != 0)
                        {
                            if (wordInfo.Word.Length > 1 && wordInfo.Word.Length <= 4 && retWords.Count > 0 && AutoStudy && !lstIsName)
                            {
                                T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
                                _MatchNameRule.AddBefore(retWords[retWords.Count - 1]);
                            }

                            lstIsName = true;
                        }


                        InsertWordToArray(wordInfo.Word, retWords);
                    }

                    if (lastPos < word.Length)
                    {
                        //尾部有未匹配词,将单个字逐个加入
                        for (int j = lastPos; j < word.Length; j++)
                        {
                            InsertWordToArray(word[j].ToString(), retWords);
                        }

                        //InsertWordToArray(word.Substring(lastPos, word.Length - lastPos), retWords);
                    }
                }

                i++;
            }

            return(retWords);
        }