示例#1
0
        /// <summary>
        /// Scans the text and extracts every dictionary word found in it.
        /// </summary>
        /// <remarks>
        /// Every character position is tried as a starting point, so words that
        /// overlap each other or are contained in a longer word are all reported.
        /// </remarks>
        /// <param name="fullText">The text to scan.</param>
        /// <returns>
        /// The matched words in order of start position. The list is empty when
        /// <paramref name="fullText"/> is null or empty, or when nothing matches.
        /// </returns>
        public List <WordInfo> ExtractFullText(String fullText)
        {
            List <WordInfo> words = new List <WordInfo>();

            if (String.IsNullOrEmpty(fullText))
            {
                return(words);
            }

            // Walk the automaton from every start position. The previous single-loop
            // version only backtracked when a match failed or hit a leaf, so a walk
            // that was still in progress when the text ended never retried from
            // start + 1 and words beginning inside that tail were lost.
            for (int start = 0; start < fullText.Length; start++)
            {
                T_DfaUnit cur = null; // null makes Next begin at the first-character table

                for (int i = start; i < fullText.Length; i++)
                {
                    cur = _WordDfa.Next(cur, fullText[i]);
                    if (cur == null)
                    {
                        // No transition for this character: nothing longer can match from 'start'.
                        break;
                    }

                    if (cur.QuitWord == null)
                    {
                        // Only a prefix so far; keep consuming characters.
                        continue;
                    }

                    WordInfo wordInfo = new WordInfo();
                    wordInfo.Word     = cur.QuitWord;
                    wordInfo.Position = start;
                    wordInfo.Rank     = _WordDfa.GetRank(wordInfo.Word);
                    wordInfo.Tag      = cur.Tag;
                    words.Add(wordInfo);

                    if (cur.Childs == null)
                    {
                        // Leaf unit: no longer word shares this prefix.
                        break;
                    }
                }
            }

            return(words);
        }
示例#2
0
        /// <summary>
        /// Inserts a word into the finite automaton.
        /// </summary>
        /// <remarks>
        /// A backslash in <paramref name="word"/> is an escape marker: it is not stored
        /// itself, and the character that follows it is added with its escape flag set.
        /// A null or empty word, or a word that is already registered, is ignored.
        /// </remarks>
        /// <param name="word">The word to insert.</param>
        /// <param name="rank">The weight of the word; any non-zero value switches on rank usage.</param>
        /// <param name="tag">Caller-defined object stored on the unit that terminates the word.</param>
        /// <exception cref="ArgumentException">
        /// The word ends with an unescaped escape character. Nothing is registered in that case.
        /// </exception>
        public void InsertWordToDfa(String word, int rank, object tag)
        {
            if (String.IsNullOrEmpty(word))
            {
                return;
            }

            if (rank != 0)
            {
                _UseRank = true;
            }

            if (_WordsTbl[word] != null)
            {
                return;
            }

            int       pos;
            T_DfaUnit unit = GetLastMatchUnit(word, out pos);

            // Validate the escape sequence before touching any state, so that a
            // rejected word leaves neither a table entry nor half-built units behind.
            bool needTrans = false;
            for (int i = pos; i < word.Length; i++)
            {
                needTrans = !needTrans && word[i] == '\\';
            }

            if (needTrans)
            {
                // The last character is a dangling escape marker.
                throw new ArgumentException("Last char is trans char!");
            }

            _WordsTbl[word] = rank;

            if (pos == word.Length)
            {
                // The whole word already exists as a path (it is a prefix of a longer
                // word). GetLastMatchUnit marked the final unit as a word end, but no
                // unit is created below, so the tag has to be attached here.
                if (unit != null)
                {
                    unit.Tag = tag;
                }

                return;
            }

            for (int i = pos; i < word.Length; i++)
            {
                if (!needTrans && word[i] == '\\')
                {
                    // Escape marker: flag the next character instead of storing this one.
                    needTrans = true;
                    continue;
                }

                // Only the unit for the final character carries the completed word.
                String quitWord = (i == word.Length - 1) ? word : null;
                unit = AddChar(unit, word[i], quitWord, needTrans, tag);

                needTrans = false;
            }
        }
示例#3
0
        Hashtable _FstCharTbl; // First-character hash table; serves as the entry point of the finite automaton

        /// <summary>
        /// Creates a new unit for <paramref name="c"/> and links it into the automaton.
        /// </summary>
        /// <param name="cur">Parent unit, or null to register the new unit in the first-character table.</param>
        /// <param name="c">Character stored in the new unit.</param>
        /// <param name="quitWord">The complete word when this unit terminates one; otherwise null.</param>
        /// <param name="needTrans">Escape flag stored on the new unit.</param>
        /// <param name="tag">Stored on the unit only when <paramref name="quitWord"/> is not null.</param>
        /// <returns>The newly created unit.</returns>
        private T_DfaUnit AddChar(T_DfaUnit cur, Char c, String quitWord, bool needTrans, object tag)
        {
            T_DfaUnit created = new T_DfaUnit();

            created.Char       = c;
            created.NeedTrans  = needTrans;
            created.Childs     = null;
            created.NextFriend = null;
            created.QuitWord   = quitWord;
            if (quitWord != null)
            {
                created.Tag = tag;
            }

            if (cur == null)
            {
                // Root level: the unit becomes the entry for its character.
                Debug.Assert(_FstCharTbl[c] == null);
                _FstCharTbl[c] = created;
                return(created);
            }

            if (cur.Childs == null)
            {
                // First child of the parent.
                cur.Childs = created;
                return(created);
            }

            // Append to the end of the parent's sibling chain.
            T_DfaUnit tail = cur.Childs;
            while (tail.NextFriend != null)
            {
                tail = tail.NextFriend;
            }

            tail.NextFriend = created;
            return(created);
        }
示例#4
0
        /// <summary>
        /// Walks the finite automaton along <paramref name="word"/> and returns the last
        /// unit that matches it.
        /// </summary>
        /// <remarks>
        /// When every character of the word matches, the final unit is marked as a word
        /// end by assigning the word to its <c>QuitWord</c>.
        /// </remarks>
        /// <param name="word">The word to follow.</param>
        /// <param name="pos">Receives the number of leading characters that were matched.</param>
        /// <returns>
        /// The last matching unit, or null when the first character does not match or
        /// <paramref name="word"/> is null or empty.
        /// </returns>
        private T_DfaUnit GetLastMatchUnit(String word, out int pos)
        {
            pos = 0;

            // Without this guard a null word fails on word.Length and an empty word
            // fails on the QuitWord assignment below, because no unit is ever reached.
            if (String.IsNullOrEmpty(word))
            {
                return(null);
            }

            T_DfaUnit cur = null;

            while (pos < word.Length)
            {
                T_DfaUnit next = Next(cur, word[pos]);
                if (next == null)
                {
                    // Mismatch: hand back the deepest unit reached so far.
                    return(cur);
                }

                cur = next;
                pos++;
            }

            // The whole word already exists as a path; flag its final unit as a word end.
            cur.QuitWord = word;
            return(cur);
        }
示例#5
0
        /// <summary>
        /// Performs one transition of the automaton for the character <paramref name="c"/>.
        /// </summary>
        /// <param name="cur">Current unit, or null to start from the first-character table.</param>
        /// <param name="c">Input character.</param>
        /// <returns>The unit reached by the transition, or null when there is none.</returns>
        public T_DfaUnit Next(T_DfaUnit cur, Char c)
        {
            if (cur == null)
            {
                // Entry point: look the character up directly; a missing entry yields null.
                return((T_DfaUnit)_FstCharTbl[c]);
            }

            for (T_DfaUnit child = cur.Childs; child != null; child = child.NextFriend)
            {
                if (child.NeedTrans && TransCharEqual(child.Char, c))
                {
                    // NOTE(review): an escaped match returns the current unit rather than
                    // the child, so the automaton does not advance - confirm this is intended.
                    return(cur);
                }

                if (child.Char == c)
                {
                    return(child);
                }
            }

            return(null);
        }