/// <summary>
/// Segments the input text and returns a <see cref="T_WordInfo"/> for every word that is kept.
/// </summary>
/// <param name="str">Text to segment; passed unchanged to SegmentNoStopWord.</param>
/// <returns>
/// The kept words in segmentation order. Each entry's Position is the summed length of all
/// segments that precede it, including segments dropped as stop words.
/// </returns>
public List<T_WordInfo> SegmentToWordInfos(String str)
{
    // Save the dictionary on its timer before doing any work.
    SaveDictOnTime();

    List<T_WordInfo> result = new List<T_WordInfo>();
    int offset = 0;

    foreach (String segment in SegmentNoStopWord(str))
    {
        // The stop-word tables are only consulted when filtering is switched on.
        bool isStopWord = _FilterStopWords
            && (_ChsStopwordTbl[segment] != null || _EngStopwordTbl[segment] != null);

        if (!isStopWord)
        {
            T_WordInfo info = new T_WordInfo();
            info.Word = segment;
            info.Position = offset;
            result.Add(info);
        }

        // Skipped stop words still advance the running offset.
        offset += segment.Length;
    }

    return result;
}
/// <summary>
/// Scans the full text and extracts every dictionary word it contains, including
/// words that overlap or are nested inside other words.
/// </summary>
/// <param name="fullText">Text to scan. May be null or empty.</param>
/// <returns>
/// The matched words, ordered by non-decreasing start position. An empty list is returned
/// when the text is null/empty or when nothing matches.
/// </returns>
public List<T_WordInfo> ExtractFullText(String fullText)
{
    List<T_WordInfo> words = new List<T_WordInfo>();

    if (String.IsNullOrEmpty(fullText))
    {
        return words;
    }

    T_DfaUnit cur = null;   // current DFA state; null means "start from the initial state"
    bool find = false;      // true while a match attempt that began at 'pos' is in progress
    int pos = 0;            // start index of the match attempt in progress
    int i = 0;

    while (true)
    {
        if (i >= fullText.Length)
        {
            if (!find)
            {
                break;
            }

            // The text ended in the middle of a match attempt. Words may still start
            // inside that attempt, so backtrack exactly as for a dead end inside the
            // text. Without this step, words beginning after 'pos' were silently lost
            // whenever the input ended on a prefix of a longer dictionary word.
            // 'pos' strictly increases on every restart, so the scan still terminates.
            find = false;
            cur = null;
            i = pos + 1;
            continue;
        }

        cur = _WordDfa.Next(cur, fullText[i]);

        if (cur != null && !find)
        {
            // First accepted character of a new match attempt.
            pos = i;
            find = true;
        }

        if (find)
        {
            if (cur == null)
            {
                // Dead end: words may be contained in one another, so restart the scan
                // one character after the start of the failed attempt.
                find = false;
                i = pos + 1;
                continue;
            }
            else if (cur.QuitWord != null)
            {
                T_WordInfo wordInfo = new T_WordInfo();
                wordInfo.Word = cur.QuitWord;
                wordInfo.Position = pos;
                wordInfo.Rank = _WordDfa.GetRank(wordInfo.Word);
                wordInfo.Tag = cur.Tag;
                words.Add(wordInfo);

                if (cur.Childs == null)
                {
                    // No longer word can extend this one; backtrack to look for
                    // words that start inside it.
                    find = false;
                    cur = null;
                    i = pos + 1;
                    continue;
                }
            }
        }

        i++;
    }

    return words;
}
/// <summary>
/// Sums the Frequency of the dictionary entries attached to the selected words.
/// </summary>
/// <param name="words">Candidate words; the Tag of each selected word is cast to T_DictStruct.</param>
/// <param name="list">Indexes into <paramref name="words"/> identifying the selected words.</param>
/// <returns>The accumulated frequency, or 0 when <paramref name="list"/> is empty.</returns>
protected virtual double GetFreqWeight(List<T_WordInfo> words, List<int> list)
{
    double total = 0;

    foreach (int wordIndex in list)
    {
        T_DictStruct entry = (T_DictStruct)words[wordIndex].Tag;
        total += entry.Frequency;
    }

    return total;
}
/// <summary>
/// Counts the adjacent pairs of selected words that are accepted by _PosBinRule.Match.
/// </summary>
/// <param name="words">Candidate words.</param>
/// <param name="list">Indexes into <paramref name="words"/>, in the order the words are chained.</param>
/// <returns>
/// The number of consecutive (previous, next) pairs for which the rule matches;
/// 0 when fewer than two indexes are supplied.
/// </returns>
protected virtual int GetPosWeight(List<T_WordInfo> words, List<int> list)
{
    int matched = 0;

    for (int next = 1; next < list.Count; next++)
    {
        T_WordInfo left = words[list[next - 1]];
        T_WordInfo right = words[list[next]];

        if (_PosBinRule.Match(left.Word, right.Word))
        {
            matched++;
        }
    }

    return matched;
}
/// <summary>
/// Merges a run of preliminary segments into one English special word.
/// When the dictionary holds special words such as U.S.A, C++ or C#, the English and
/// letter pieces produced by the first segmentation pass have to be joined again.
/// </summary>
/// <param name="extractWords">Extractor used to look the joined text up in the dictionary.</param>
/// <param name="words">Preliminary segments; every element is cast to string.</param>
/// <param name="start">Index of the first segment to merge.</param>
/// <param name="end">
/// Set to the index just past the last merged segment when the merge succeeds;
/// left untouched otherwise.
/// </param>
/// <returns>
/// The joined text when the extractor reports exactly one word and that word is as long as
/// the whole joined text; otherwise null.
/// </returns>
protected virtual string MergeEnglishSpecialWord(CExtractWords extractWords, ArrayList words, int start, ref int end)
{
    StringBuilder merged = new StringBuilder();
    int index = start;

    while (index < words.Count)
    {
        string piece = (string)words[index];

        // An empty piece or a pure separator (space, CR, LF, ...) ends the run.
        if (piece.Trim() == "")
        {
            break;
        }

        // A piece starting with a Chinese character (U+4E00..U+9FA5) ends the run.
        char first = piece[0];
        if (first >= 0x4e00 && first <= 0x9fa5)
        {
            break;
        }

        merged.Append(piece);
        index++;
    }

    String mergeString = merged.ToString();
    List<T_WordInfo> matches = extractWords.ExtractFullText(mergeString);

    if (matches.Count != 1)
    {
        return null;
    }

    if (matches[0].Word.Length != mergeString.Length)
    {
        return null;
    }

    end = index;
    return mergeString;
}
/// <summary>
/// Extracts the dictionary words of the full text and resolves overlapping candidates,
/// keeping for every overlapping group the combination chosen by GameTree.
/// </summary>
/// <param name="fullText">Text to scan.</param>
/// <returns>
/// The selected words. The list is empty when ExtractFullText finds no word.
/// </returns>
public List<T_WordInfo> ExtractFullTextMaxMatch(String fullText)
{
    List<T_WordInfo> selected = new List<T_WordInfo>();
    List<T_WordInfo> candidates = ExtractFullText(fullText);

    int groupStart = 0;
    while (groupStart < candidates.Count)
    {
        // Grow the group [groupStart, groupEnd] while the next candidate starts at or
        // before the furthest end position covered so far.
        int groupEnd = groupStart;
        int coveredEnd = 0;

        // Deeply nested groups are rare; the span is capped so that the game-tree
        // search below does not have to explore too many levels.
        while (groupEnd < candidates.Count - 1 && groupEnd - groupStart <= 16)
        {
            T_WordInfo current = candidates[groupEnd];
            int currentEnd = current.Position + current.Word.Length - 1;
            if (coveredEnd < currentEnd)
            {
                coveredEnd = currentEnd;
            }

            if (coveredEnd < candidates[groupEnd + 1].Position)
            {
                break;
            }

            groupEnd++;
        }

        if (groupEnd == groupStart)
        {
            // A candidate that overlaps nothing is kept as it is.
            selected.Add(candidates[groupStart]);
            groupStart++;
            continue;
        }

        // Reset the shared search state to its "worst" starting values, then let
        // GameTree leave the chosen indexes in _GameNodes.
        int spaceNum = 0;
        int deep = 0;
        _GameNodes = new List<int>();
        _MinDeep = 65535;
        _MinSpace = 65535 * 256;

        GameTree(candidates, new List<int>(), true, groupStart, groupEnd, ref spaceNum, ref deep);

        foreach (int chosen in _GameNodes)
        {
            selected.Add(candidates[chosen]);
        }

        groupStart = groupEnd + 1;
    }

    return selected;
}
/// <summary>
/// Game tree: recursively enumerates chains of non-overlapping words inside
/// words[begin..end] and records the preferred chain in _GameNodes, together with its
/// scores in _MinSpace and _MinDeep.
/// </summary>
/// <param name="words">Candidate words that the indexes refer to.</param>
/// <param name="nodes">Working list holding the word indexes of the chain currently being built.</param>
/// <param name="init">
/// true for the top-level call, which tries every index in [begin, end] as the first word
/// of a chain; false for the recursive calls that extend a chain.
/// </param>
/// <param name="begin">Index of the first word of the range (init) or of the word to append (recursion).</param>
/// <param name="end">Index of the last word of the range, inclusive.</param>
/// <param name="spaceNum">Running count of character positions not covered by the chain.</param>
/// <param name="deep">Running count of words in the chain.</param>
/// <returns>
/// For init == true always null (the result is left in _GameNodes). Otherwise a copy of
/// <paramref name="nodes"/> when this call finished a chain, or null.
/// </returns>
private List <int> GameTree(List <T_WordInfo> words, List <int> nodes, bool init, int begin, int end, ref int spaceNum, ref int deep)
{
    if (init)
    {
        int startPos = ((T_WordInfo)words[begin]).Position;

        // Try each word of the range as the first word of a chain.
        for (int i = begin; i <= end; i++)
        {
            T_WordInfo wordInfo = (T_WordInfo)words[i];

            // Characters between the start of the range and this first word count as uncovered.
            spaceNum = wordInfo.Position - startPos;
            deep = 0;

            List <int> oneNodes;

            if (i == end)
            {
                // The last word can only form a chain of its own.
                oneNodes = new List <int>();
                oneNodes.Add(i);
                deep++;
            }
            else
            {
                oneNodes = GameTree(words, nodes, false, i, end, ref spaceNum, ref deep);
            }

            if (oneNodes != null)
            {
                bool select = false;

                // Prefer fewer uncovered characters, then fewer words.
                if (_MinSpace > spaceNum || (_MinSpace == spaceNum && deep < _MinDeep))
                {
                    select = true;

                    // When the stored best already has no uncovered characters, an
                    // optional frequency callback makes the final decision.
                    if (_MinSpace == 0)
                    {
                        if (SelectByFreqEvent != null)
                        {
                            select = SelectByFreqEvent(words, _GameNodes, oneNodes);
                        }
                    }
                }
                else if (_MinDeep == deep && _MinSpace == spaceNum)
                {
                    // Exact tie on both scores: use the optional position comparer when
                    // there is no uncovered character, otherwise CompareGroup.
                    if (_CompareByPos != null && _MinSpace == 0)
                    {
                        select = _CompareByPos(words, _GameNodes, oneNodes);
                    }
                    else
                    {
                        select = CompareGroup(words, _GameNodes, oneNodes, MatchDirection);
                    }
                }

                if (select)
                {
                    // Store the new best chain and its scores.
                    _MinDeep = deep;
                    _MinSpace = spaceNum;
                    _GameNodes.Clear();

                    foreach (int obj in oneNodes)
                    {
                        _GameNodes.Add(obj);
                    }
                }
            }

            // Reset the working state before trying the next first word.
            deep = 0;
            nodes.Clear();
        }
    }
    else
    {
        // Append words[begin] to the chain being built.
        nodes.Add(begin);
        deep++;
        T_WordInfo last = (T_WordInfo)words[begin];
        bool nextStep = false;   // set when a following non-overlapping word is tried; cleared again if that attempt is selected
        bool reach = false;      // true right after a deeper chain was stored as the best
        int endPos = last.Position + last.Word.Length - 1;   // furthest end position seen in the range
        int oldDeep = deep;      // scores at this level, restored before each new attempt
        int oldSpace = spaceNum;

        for (int i = begin + 1; i <= end; i++)
        {
            T_WordInfo cur = (T_WordInfo)words[i];

            if (endPos < cur.Position + cur.Word.Length - 1)
            {
                endPos = cur.Position + cur.Word.Length - 1;
            }

            // Only words that start at or after the end of 'last' can follow it.
            if (last.Position + last.Word.Length <= cur.Position)
            {
                nextStep = true;

                if (reach)
                {
                    // The previous attempt was selected and its state kept; restore the
                    // scores of this level and drop the last index before trying another
                    // continuation.
                    reach = false;
                    spaceNum = oldSpace;
                    deep = oldDeep;
                    nodes.RemoveAt(nodes.Count - 1);
                }

                // The gap between 'last' and 'cur' is uncovered.
                spaceNum += cur.Position - (last.Position + last.Word.Length);
                List <int> oneNodes;
                oneNodes = GameTree(words, nodes, false, i, end, ref spaceNum, ref deep);

                if (oneNodes != null)
                {
                    bool select = false;

                    // Same ranking as in the init branch, without the frequency callback.
                    if (_MinSpace > spaceNum || (_MinSpace == spaceNum && deep < _MinDeep))
                    {
                        select = true;
                    }
                    else if (_MinDeep == deep && _MinSpace == spaceNum)
                    {
                        if (_CompareByPos != null && _MinSpace == 0)
                        {
                            select = _CompareByPos(words, _GameNodes, oneNodes);
                        }
                        else
                        {
                            select = CompareGroup(words, _GameNodes, oneNodes, MatchDirection);
                        }
                    }

                    if (select)
                    {
                        reach = true;
                        nextStep = false;
                        _MinDeep = deep;
                        _MinSpace = spaceNum;
                        _GameNodes.Clear();

                        foreach (int obj in oneNodes)
                        {
                            _GameNodes.Add(obj);
                        }
                    }
                    else
                    {
                        // Rejected: restore the scores and cut 'nodes' back to this level.
                        spaceNum = oldSpace;
                        deep = oldDeep;
                        nodes.RemoveRange(deep, nodes.Count - deep);
                    }
                }
                else
                {
                    // The deeper call produced no chain: restore the scores and cut
                    // 'nodes' back to this level.
                    spaceNum = oldSpace;
                    deep = oldDeep;
                    nodes.RemoveRange(deep, nodes.Count - deep);
                }
            }
        }

        if (!nextStep)
        {
            // No following word was tried, or the last attempt was selected: the chain
            // ends here. Everything from the end of 'last' up to the furthest end position
            // seen is added to the uncovered count, and a snapshot of the chain is returned.
            spaceNum += endPos - (last.Position + last.Word.Length - 1);
            List <int> ret = new List <int>();

            foreach (int obj in nodes)
            {
                ret.Add(obj);
            }

            return(ret);
        }
    }

    return(null);
}