/// <summary>
/// Pre-segments a sentence into an ordered list of words.
/// </summary>
/// <remarks>
/// The sentence is first split into raw segments by <c>CRegex.GetSingleMatchStrings</c> using
/// <c>PATTERNS</c>. Each raw segment is then processed in order:
/// <list type="bullet">
/// <item>A segment that starts and ends with an ASCII digit is handed to <c>MergeFloat</c>.</item>
/// <item>A segment that starts with an ASCII letter is offered to <c>MergeEnglishSpecialWord</c>,
/// and, if that yields nothing and the next segment starts with '@', to <c>MergeEmail</c>.</item>
/// <item>A segment whose first character lies outside U+4E00..U+9FA5 is emitted unchanged.</item>
/// <item>Any other segment is run through <c>_ExtractWords.ExtractFullTextMaxMatch</c>; characters
/// not covered by a dictionary match are emitted one at a time.</item>
/// </list>
/// Merging is only attempted when a following segment exists. When <c>AutoStudy</c> is set, the
/// words adjacent to person names (<c>T_POS.POS_A_NR</c>) are reported to <c>_MatchNameRule</c>.
/// </remarks>
/// <param name="str">The sentence to segment.</param>
/// <returns>
/// The segmented words in order, or an empty list when <c>CRegex.GetSingleMatchStrings</c>
/// returns <c>false</c>.
/// </returns>
private List<String> PreSegment(String str)
{
    ArrayList initSeg = new ArrayList();

    if (!CRegex.GetSingleMatchStrings(str, PATTERNS, true, ref initSeg))
    {
        return new List<String>();
    }

    List<String> retWords = new List<String>();
    int i = 0;

    _ExtractWords.MatchDirection = MatchDirection;

    while (i < initSeg.Count)
    {
        String word = (String)initSeg[i];

        // An empty (or missing) segment is emitted as a single space; this also guarantees
        // that the word[0] reads below are always in range.
        if (String.IsNullOrEmpty(word))
        {
            word = " ";
        }

        char first = word[0];

        // The merge helpers look at the segments that follow, so merging is only attempted
        // when this is not the last segment.
        if (i < initSeg.Count - 1)
        {
            char last = word[word.Length - 1];
            bool mergeOk = false;

            // NOTE(review): the original tested the ASCII range '0'..'9' twice on each side of
            // an "or". The second test was presumably meant for full-width digits
            // (U+FF10..U+FF19) - confirm against MergeFloat before widening the check.
            if (first >= '0' && first <= '9' && last >= '0' && last <= '9')
            {
                // Merge a floating-point number. The helper receives i by ref and is
                // presumably responsible for advancing it, since no i++ follows.
                word = MergeFloat(initSeg, i, ref i);
                mergeOk = true;
            }
            else if ((first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z'))
            {
                // Try to merge into an English technical term first.
                String specialEnglish = MergeEnglishSpecialWord(_ExtractWords, initSeg, i, ref i);
                if (specialEnglish != null)
                {
                    InsertWordToArray(specialEnglish, retWords);
                    continue;
                }

                // Otherwise merge an e-mail address when the next segment starts with '@'.
                String next = (String)initSeg[i + 1];
                if (!String.IsNullOrEmpty(next) && next[0] == '@')
                {
                    word = MergeEmail(initSeg, i, ref i);
                    mergeOk = true;
                }
            }

            if (mergeOk)
            {
                InsertWordToArray(word, retWords);
                continue;
            }
        }

        // No merge happened, so word (and therefore first) is still the raw segment.
        if (first < 0x4e00 || first > 0x9fa5)
        {
            // English text or a symbol: emit as is.
            InsertWordToArray(word, retWords);
        }
        else
        {
            List<T_WordInfo> words = _ExtractWords.ExtractFullTextMaxMatch(word);
            int lastPos = 0;            // index just past the previous dictionary match
            bool previousIsName = false; // whether the previous dictionary match was a person name

            foreach (T_WordInfo wordInfo in words)
            {
                // Characters between two matches were not recognised: emit them one by one.
                for (int j = lastPos; j < wordInfo.Position; j++)
                {
                    InsertWordToArray(word[j].ToString(), retWords);
                }

                lastPos = wordInfo.Position + wordInfo.Word.Length;

                bool isName = (((T_DictStruct)wordInfo.Tag).Pos & (int)T_POS.POS_A_NR) != 0;

                if (AutoStudy)
                {
                    // Record the word that follows a person name (name suffix statistics).
                    // NOTE(review): this goes through AddBefore, the same call used for the
                    // prefix statistics below - confirm that a separate "after" counter was
                    // not intended.
                    if (previousIsName && !isName)
                    {
                        _MatchNameRule.AddBefore(wordInfo.Word);
                    }

                    // Record the word that precedes a person name (titles such as
                    // "president" or "chairman"). The previous-word flag is tested before it
                    // is updated; the original cleared it first, which made this guard always
                    // pass and recorded names themselves as name prefixes.
                    if (isName &&
                        !previousIsName &&
                        wordInfo.Word.Length > 1 &&
                        wordInfo.Word.Length <= 4 &&
                        retWords.Count > 0)
                    {
                        _MatchNameRule.AddBefore(retWords[retWords.Count - 1]);
                    }
                }

                previousIsName = isName;

                InsertWordToArray(wordInfo.Word, retWords);
            }

            // Unrecognised characters after the last match are also emitted one by one.
            for (int j = lastPos; j < word.Length; j++)
            {
                InsertWordToArray(word[j].ToString(), retWords);
            }
        }

        i++;
    }

    return retWords;
}