/// <summary>
/// Scans the full text and extracts every dictionary word recognised by the automaton.
/// </summary>
/// <param name="fullText">Text to scan.</param>
/// <returns>
/// A list with one T_WordInfo per match (word, start position, rank and tag).
/// The list is empty, never null, when the text is null/empty or nothing matches.
/// </returns>
public List<T_WordInfo> ExtractFullText(String fullText)
{
    List<T_WordInfo> words = new List<T_WordInfo>();

    if (String.IsNullOrEmpty(fullText))
    {
        return words;
    }

    T_DfaUnit cur = null;   // current automaton state; null means "at the entry point"
    bool find = false;      // true while a candidate word starting at pos is being followed
    int pos = 0;            // start index of the candidate currently being followed
    int i = 0;

    while (i < fullText.Length)
    {
        cur = _WordDfa.Next(cur, fullText[i]);

        if (cur != null && !find)
        {
            // First character of a new candidate.
            pos = i;
            find = true;
        }

        if (find)
        {
            if (cur == null)
            {
                // The candidate died. Words may contain one another, so restart
                // right after the candidate's first character instead of at i.
                find = false;
                i = pos + 1;
                continue;
            }

            if (cur.QuitWord != null)
            {
                T_WordInfo wordInfo = new T_WordInfo();
                wordInfo.Word = cur.QuitWord;
                wordInfo.Position = pos;
                wordInfo.Rank = _WordDfa.GetRank(wordInfo.Word);
                wordInfo.Tag = cur.Tag;
                words.Add(wordInfo);

                if (cur.Childs == null)
                {
                    // No longer word can continue from here; backtrack as above.
                    find = false;
                    cur = null;
                    i = pos + 1;
                    continue;
                }
            }
        }

        i++;

        if (find && i >= fullText.Length)
        {
            // The text ended in the middle of a candidate. Backtrack exactly as on a
            // mismatch, otherwise words that start inside the unfinished candidate
            // (e.g. "bc" inside "abc" when the dictionary holds "abcd" and "bc")
            // would never be reported. pos grows on every restart, so this terminates.
            find = false;
            cur = null;
            i = pos + 1;
        }
    }

    return words;
}
/// <summary>
/// Inserts a word into the finite automaton.
/// </summary>
/// <param name="word">
/// Word to insert. A backslash marks the character that follows it as escaped
/// (that character is stored with NeedTrans = true); the backslash itself is not stored.
/// Null or empty words are ignored, as are words already present in the word table.
/// </param>
/// <param name="rank">Weight of the word. Any non-zero value switches _UseRank on.</param>
/// <param name="tag">Caller-supplied object stored together with the word.</param>
/// <exception cref="ArgumentException">
/// The part of the word that still has to be added ends with an unescaped backslash.
/// The word table and the automaton are left untouched in that case.
/// </exception>
public void InsertWordToDfa(String word, int rank, object tag)
{
    if (String.IsNullOrEmpty(word))
    {
        return;
    }

    if (rank != 0)
    {
        _UseRank = true;
    }

    if (_WordsTbl[word] != null)
    {
        // Already registered.
        return;
    }

    // Follow the part of the word that the automaton already contains.
    int pos;
    T_DfaUnit unit = GetLastMatchUnit(word, out pos, tag);

    // Validate BEFORE touching any state: rejecting the word only after it was put
    // into the word table (and after some of its nodes were added) would leave a
    // half-inserted word behind that later inserts treat as a duplicate.
    if (HasDanglingEscape(word, pos))
    {
        throw new ArgumentException("Last char is trans char!");
    }

    T_InnerInfo innerInfo = new T_InnerInfo();
    innerInfo.Rank = rank;
    innerInfo.Tag = tag;
    _WordsTbl[word] = innerInfo;

    bool needTrans = false;

    for (int i = pos; i < word.Length; i++)
    {
        if (!needTrans && word[i] == '\\')
        {
            // Escape marker: flag the next character instead of storing the backslash.
            needTrans = true;
            continue;
        }

        // Only the node of the final character carries the complete word.
        String quitWord = (i == word.Length - 1) ? word : null;
        unit = AddChar(unit, word[i], quitWord, needTrans, tag);
        needTrans = false;
    }
}

/// <summary>
/// Tells whether the tail of <paramref name="word"/> starting at <paramref name="start"/>
/// ends with a backslash that has no character left to escape.
/// </summary>
private static bool HasDanglingEscape(String word, int start)
{
    bool escaped = false;

    for (int i = start; i < word.Length; i++)
    {
        if (!escaped && word[i] == '\\')
        {
            if (i == word.Length - 1)
            {
                return true;
            }

            escaped = true;
            continue;
        }

        escaped = false;
    }

    return false;
}
Hashtable _FstCharTbl; // First-character hash table; serves as the entry point of the automaton.

/// <summary>
/// Creates a new automaton unit for one character and links it into the automaton.
/// </summary>
/// <param name="cur">Parent unit, or null to register the new unit as a first character.</param>
/// <param name="c">Character held by the new unit.</param>
/// <param name="quitWord">Complete word ending at this unit, or null if no word ends here.</param>
/// <param name="needTrans">Whether the character was escaped in the inserted word.</param>
/// <param name="tag">Caller-supplied object; stored on the unit only when quitWord is not null.</param>
/// <returns>The newly created unit.</returns>
private T_DfaUnit AddChar(T_DfaUnit cur, Char c, String quitWord, bool needTrans, object tag)
{
    T_DfaUnit node = new T_DfaUnit();
    node.Char = c;
    node.NeedTrans = needTrans;
    node.Childs = null;
    node.QuitWord = quitWord;

    if (quitWord != null)
    {
        node.Tag = tag;
    }

    node.NextFriend = null;

    if (cur == null)
    {
        // No parent: the unit becomes an entry in the first-character table.
        Debug.Assert(_FstCharTbl[c] == null);
        _FstCharTbl[c] = node;
        return node;
    }

    if (cur.Childs == null)
    {
        // First child of the parent.
        cur.Childs = node;
        return node;
    }

    // Append to the end of the parent's sibling chain.
    T_DfaUnit tail = cur.Childs;

    while (tail.NextFriend != null)
    {
        tail = tail.NextFriend;
    }

    tail.NextFriend = node;
    return node;
}
/// <summary>
/// Walks the automaton along the characters of the word and returns the last unit
/// that still matches. When the whole word matches, that final unit is marked as
/// the end of the word (its QuitWord and Tag are overwritten).
/// </summary>
/// <param name="word">Word to follow.</param>
/// <param name="pos">
/// Receives the index of the first character that could not be matched,
/// or word.Length when every character matched.
/// </param>
/// <param name="tag">Object stored on the final unit when the whole word matches.</param>
/// <returns>The last matching unit, or null if not even the first character matches.</returns>
private T_DfaUnit GetLastMatchUnit(String word, out int pos, object tag)
{
    T_DfaUnit matched = null;

    for (pos = 0; pos < word.Length; pos++)
    {
        T_DfaUnit next = Next(matched, word[pos]);

        if (next == null)
        {
            // Mismatch: pos stays on the unmatched character.
            return matched;
        }

        matched = next;
    }

    // Every character matched an existing path; the word ends on this unit.
    matched.QuitWord = word;
    matched.Tag = tag;
    return matched;
}
/// <summary>
/// Performs one transition of the automaton.
/// </summary>
/// <param name="cur">Current unit, or null to start from the first-character table.</param>
/// <param name="c">Input character.</param>
/// <returns>The unit reached by the transition, or null when there is none.</returns>
public T_DfaUnit Next(T_DfaUnit cur, Char c)
{
    if (cur == null)
    {
        // Entry point: the table lookup yields null when no word starts with c.
        return (T_DfaUnit)_FstCharTbl[c];
    }

    for (T_DfaUnit child = cur.Childs; child != null; child = child.NextFriend)
    {
        // NOTE(review): an escaped child accepted by TransCharEqual yields the
        // current unit rather than the child, so the automaton does not advance.
        // Confirm this is intended.
        if (child.NeedTrans && TransCharEqual(child.Char, c))
        {
            return cur;
        }

        if (child.Char == c)
        {
            return child;
        }
    }

    return null;
}