/// <summary>
/// Loads a dictionary from a text file in which every line has the form
/// <c>word|pos</c>, where <c>pos</c> is an integer part-of-speech value.
/// </summary>
/// <param name="fileName">
/// Path of the dictionary text file; it is read through
/// <c>CFile.ReadFileToString</c> with the "utf-8" encoding name.
/// </param>
/// <returns>
/// A collection with one <c>WordPos</c> entry per well-formed line. Lines that
/// do not split into exactly two fields, or whose second field is not a valid
/// integer, are skipped.
/// </returns>
public WordPosCollection LoadFromTextDict(String fileName)
{
    WordPosCollection dictFile = new WordPosCollection();

    String dictStr = CFile.ReadFileToString(fileName, "utf-8");

    // Accept both Windows (CRLF) and Unix (LF) line endings. Splitting on
    // "\r\n" alone turned an LF-only file into one oversized "line" that was
    // then rejected by the field-count check below.
    String[] words = CRegex.Split(dictStr, @"\r?\n");

    // Characters whose presence in a word adds the POS_A_M flag.
    // Hoisted out of the loop so the array is built once.
    char[] chineseNumerals =
    {
        '一', '二', '三', '四', '五', '六', '七', '八', '九', '十'
    };

    foreach (String word in words)
    {
        String[] wp = CRegex.Split(word, @"\|");

        // Only "word|pos" lines are accepted.
        if (wp == null || wp.Length != 2)
        {
            continue;
        }

        // TryParse rejects malformed or out-of-range numbers without
        // throwing, so bad lines are skipped exactly as before.
        int pos;
        if (!int.TryParse(wp[1], out pos))
        {
            continue;
        }

        WordPos dict = new WordPos(wp[0], pos);

        // Any word containing one of the Chinese numeral characters
        // additionally gets the POS_A_M flag.
        if (dict.Word.IndexOfAny(chineseNumerals) >= 0)
        {
            dict.Pos |= (int)PosEnum.POS_A_M;
        }

        // Hard-coded override: this specific word is always POS_D_N,
        // replacing whatever value the file supplied.
        if (dict.Word == "字典")
        {
            dict.Pos = (int)PosEnum.POS_D_N;
        }

        dictFile.WordPosList.Add(dict);
    }

    return dictFile;
}
/// <summary>
/// Pre-segmentation: splits a sentence into coarse tokens.
/// </summary>
/// <param name="str">The sentence to segment.</param>
/// <returns>
/// The pre-segmented tokens, or an empty list when
/// <c>CRegex.GetSingleMatchStrings</c> reports no match for <c>PATTERNS</c>.
/// </returns>
private List<string> PreSegment(String str)
{
    List<string> chunks = new List<string>();

    // Original author's note: return immediately when the input contains no
    // digit strings, dates, English letters or Chinese characters; otherwise
    // the input is split into several chunks.
    if (!CRegex.GetSingleMatchStrings(str, PATTERNS, true, ref chunks))
    {
        return new List<string>();
    }

    List<string> tokens = new List<string>();

    // Hand this instance's match direction to the extractor before it is used.
    WordSegService.ExtractInfo.MatchDirection = MatchDirection;

    int index = 0;
    while (index < chunks.Count)
    {
        String chunk = chunks[index];
        if (chunk == String.Empty)
        {
            // Empty chunks are treated as a single space.
            chunk = " ";
        }

        char head = chunk[0];
        char tail = chunk[chunk.Length - 1];

        // Merging is only attempted when a following chunk exists.
        bool merged = false;
        if (index < chunks.Count - 1)
        {
            // NOTE(review): the original repeated each ASCII digit range test
            // twice; the duplicate may once have targeted full-width digits.
            // Confirm before extending this check.
            bool digitBounded = head >= '0' && head <= '9'
                                && tail >= '0' && tail <= '9';
            bool startsWithLetter = (head >= 'a' && head <= 'z')
                                    || (head >= 'A' && head <= 'Z');

            if (digitBounded)
            {
                // Chunk starts and ends with a digit. MergeFloat receives the
                // index by reference; the loop does not increment it after a merge.
                chunk = MergeFloat(chunks, index, ref index);
                merged = true;
            }
            else if (startsWithLetter)
            {
                // A letter-led chunk followed by one starting with '@' is
                // handed to MergeEmail.
                String next = chunks[index + 1];
                if (next != "" && next[0] == '@')
                {
                    chunk = MergeEmail(chunks, index, ref index);
                    merged = true;
                }
            }
        }

        if (merged)
        {
            // No increment here: the index was passed by reference to the merge helper.
            InsertWordToArray(chunk, tokens);
            continue;
        }

        // Chunks whose first character is outside U+4E00..U+9FA5 are emitted as-is.
        bool startsWithCjk = head >= 0x4e00 && head <= 0x9fa5;
        if (!startsWithCjk)
        {
            InsertWordToArray(chunk, tokens);
            index++;
            continue;
        }

        List<WordInfo> matches = WordSegService.ExtractInfo.ExtractFullTextMaxMatch(chunk);
        int cursor = 0;

        foreach (WordInfo match in matches)
        {
            // Characters between the previous match and this one were not
            // matched; emit each of them as its own single-character token.
            while (cursor < match.Position)
            {
                InsertWordToArray(chunk[cursor].ToString(), tokens);
                cursor++;
            }

            cursor = match.Position + match.Word.Length;
            InsertWordToArray(match.Word, tokens);
        }

        // Whatever remains after the last match is emitted as one whole token.
        if (cursor < chunk.Length)
        {
            InsertWordToArray(chunk.Substring(cursor), tokens);
        }

        index++;
    }

    return tokens;
}