Beispiel #1
0
        /// <summary>
        /// Segments the input text and returns a word-info entry for every retained word.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>
        /// The retained words in segmentation order. Each entry's <c>Position</c> is the sum of the
        /// lengths of all words that <c>SegmentNoStopWord</c> produced before it, including words
        /// that were dropped here as stop words.
        /// </returns>
        public List<T_WordInfo> SegmentToWordInfos(String str)
        {
            // Periodic dictionary save.
            SaveDictOnTime();

            List<String>     tokens = SegmentNoStopWord(str);
            List<T_WordInfo> result = new List<T_WordInfo>();

            int offset = 0;

            foreach (String token in tokens)
            {
                // Stop-word tables are only consulted when filtering is switched on.
                bool isStopWord = _FilterStopWords &&
                                  (_ChsStopwordTbl[token] != null || _EngStopwordTbl[token] != null);

                if (!isStopWord)
                {
                    T_WordInfo info = new T_WordInfo();
                    info.Word     = token;
                    info.Position = offset;
                    result.Add(info);
                }

                // Dropped words still advance the running position.
                offset += token.Length;
            }

            return result;
        }
Beispiel #2
0
        /// <summary>
        /// Scans the full text with the word DFA and collects every dictionary word found in it,
        /// including words that overlap or are contained in other matches.
        /// </summary>
        /// <param name="fullText">Text to scan.</param>
        /// <returns>
        /// A list with one <c>T_WordInfo</c> per match (word, start position, rank and tag), in the
        /// order the scan encounters them. The list is empty when <paramref name="fullText"/> is
        /// null or empty, or when no word matches.
        /// </returns>
        public List <T_WordInfo> ExtractFullText(String fullText)
        {
            List <T_WordInfo> words = new List <T_WordInfo>();

            if (String.IsNullOrEmpty(fullText))
            {
                return(words);
            }

            T_DfaUnit cur  = null;   // current DFA state; null means "not inside any word"
            bool      find = false;  // true while a match attempt that started at pos is in progress
            int       pos  = 0;      // start index of the current match attempt
            int       i    = 0;      // index of the character being fed to the DFA

            while (i < fullText.Length)
            {
                cur = _WordDfa.Next(cur, fullText[i]);
                if (cur != null && !find)
                {
                    // First accepted character: a new match attempt starts here.
                    pos  = i;
                    find = true;
                }

                if (find)
                {
                    if (cur == null)
                    {
                        // Dead end. Words may be contained in one another, so restart the scan
                        // one character after the start of the failed attempt.
                        find = false;
                        i    = pos + 1;
                        continue;
                    }
                    else if (cur.QuitWord != null)
                    {
                        // The current state terminates a dictionary word: record it.
                        T_WordInfo wordInfo = new T_WordInfo();
                        wordInfo.Word     = cur.QuitWord;
                        wordInfo.Position = pos;
                        wordInfo.Rank     = _WordDfa.GetRank(wordInfo.Word);
                        wordInfo.Tag      = cur.Tag;
                        words.Add(wordInfo);

                        if (cur.Childs == null)
                        {
                            // No longer word can follow from this state; backtrack so that words
                            // starting inside the one just found are also discovered.
                            find = false;
                            cur  = null;
                            i    = pos + 1;
                            continue;
                        }
                    }
                }

                i++;

                if (find && i >= fullText.Length)
                {
                    // The text ended while a match attempt was still open. Without this backtrack
                    // any word starting after pos inside the trailing span would be silently
                    // skipped (e.g. "b" in "ab" when the dictionary holds "abc" and "b").
                    // pos strictly increases on every restart, so the loop still terminates.
                    find = false;
                    cur  = null;
                    i    = pos + 1;
                }
            }

            return(words);
        }
Beispiel #3
0
        /// <summary>
        /// Adds up the dictionary frequency of the words selected by <paramref name="list"/>.
        /// </summary>
        /// <param name="words">Candidate words; the <c>Tag</c> of every selected entry is cast to <c>T_DictStruct</c>.</param>
        /// <param name="list">Indexes into <paramref name="words"/> identifying the selected words.</param>
        /// <returns>The sum of <c>Frequency</c> over the selected words; 0 when <paramref name="list"/> is empty.</returns>
        protected virtual double GetFreqWeight(List<T_WordInfo> words, List<int> list)
        {
            double total = 0;

            foreach (int wordIndex in list)
            {
                T_WordInfo   selected = words[wordIndex];
                T_DictStruct entry    = (T_DictStruct)selected.Tag;

                total += entry.Frequency;
            }

            return total;
        }
Beispiel #4
0
        /// <summary>
        /// Counts how many adjacent pairs of selected words satisfy the binary rule <c>_PosBinRule</c>.
        /// </summary>
        /// <param name="words">Candidate words.</param>
        /// <param name="list">Indexes into <paramref name="words"/>; consecutive entries form the pairs that are tested.</param>
        /// <returns>The number of consecutive pairs for which <c>_PosBinRule.Match</c> returns true.</returns>
        protected virtual int GetPosWeight(List<T_WordInfo> words, List<int> list)
        {
            int matches = 0;

            // Walk the selection with "next" one step ahead of the left-hand word of each pair.
            for (int next = 1; next < list.Count; next++)
            {
                T_WordInfo left  = words[list[next - 1]];
                T_WordInfo right = words[list[next]];

                if (!_PosBinRule.Match(left.Word, right.Word))
                {
                    continue;
                }

                matches++;
            }

            return matches;
        }
Beispiel #5
0
        /// <summary>
        /// Merges English special words.
        /// When the dictionary contains English special terms such as U.S.A, C++ or C#, the English
        /// words and letters produced by the preliminary segmentation have to be joined back together.
        /// </summary>
        /// <param name="extractWords">Extractor used to look the joined text up in the dictionary.</param>
        /// <param name="words">Preliminary segmentation result; every element is cast to <c>string</c>.</param>
        /// <param name="start">Index of the first word to merge.</param>
        /// <param name="end">
        /// On success, receives the index of the first word that was not merged; left untouched otherwise.
        /// </param>
        /// <returns>
        /// The joined text when the extractor finds exactly one word in it and that word is as long
        /// as the joined text; otherwise <c>null</c>.
        /// </returns>
        protected virtual string MergeEnglishSpecialWord(CExtractWords extractWords, ArrayList words, int start, ref int end)
        {
            StringBuilder buffer = new StringBuilder();
            int           cursor = start;

            while (cursor < words.Count)
            {
                string token = (string)words[cursor];

                // Stop at an empty token or a separator (space, carriage return, line feed, ...).
                bool isSeparator = token.Trim().Length == 0;

                // Stop as soon as a token starts with a character in the 0x4e00-0x9fa5 (Chinese) range.
                if (isSeparator || (token[0] >= 0x4e00 && token[0] <= 0x9fa5))
                {
                    break;
                }

                buffer.Append(token);
                cursor++;
            }

            String           merged  = buffer.ToString();
            List<T_WordInfo> matches = extractWords.ExtractFullText(merged);

            if (matches.Count != 1)
            {
                return null;
            }

            // The single match has to be as long as the whole merged text.
            if (matches[0].Word.Length != merged.Length)
            {
                return null;
            }

            end = cursor;
            return merged;
        }
Beispiel #6
0
        /// <summary>
        /// Extracts the matching words of the full text using maximum matching: runs of overlapping
        /// candidates found by <c>ExtractFullText</c> are handed to <c>GameTree</c>, and the indexes
        /// it leaves in <c>_GameNodes</c> decide which of them are kept.
        /// </summary>
        /// <param name="fullText">Text to scan.</param>
        /// <returns>
        /// The selected words as a list; the list is empty when no word matches.
        /// </returns>
        public List<T_WordInfo> ExtractFullTextMaxMatch(String fullText)
        {
            List<T_WordInfo> selected   = new List<T_WordInfo>();
            List<T_WordInfo> candidates = ExtractFullText(fullText);

            int groupStart = 0;

            while (groupStart < candidates.Count)
            {
                // Grow [groupStart, groupEnd] while the next candidate starts inside the covered range.
                int groupEnd    = groupStart;
                int coveredUpTo = 0;

                while (groupEnd < candidates.Count - 1)
                {
                    if (groupEnd - groupStart > 16)
                    {
                        // Deep nesting is rare; when it does happen, cut the group short so the
                        // game tree does not get too many levels and hurt performance.
                        break;
                    }

                    T_WordInfo member     = candidates[groupEnd];
                    int        memberLast = member.Position + member.Word.Length - 1;

                    if (memberLast > coveredUpTo)
                    {
                        coveredUpTo = memberLast;
                    }

                    if (candidates[groupEnd + 1].Position > coveredUpTo)
                    {
                        break;
                    }

                    groupEnd++;
                }

                if (groupEnd == groupStart)
                {
                    // Stand-alone candidate: keep it as is.
                    selected.Add(candidates[groupStart]);
                    groupStart++;
                    continue;
                }

                // Reset the shared search state, then let the game tree choose among the group.
                int spaceNum = 0;
                int deep     = 0;
                _GameNodes = new List<int>();
                _MinDeep   = 65535;
                _MinSpace  = 65535 * 256;

                GameTree(candidates, new List<int>(), true, groupStart, groupEnd, ref spaceNum, ref deep);

                foreach (int chosen in _GameNodes)
                {
                    selected.Add(candidates[chosen]);
                }

                groupStart = groupEnd + 1;
            }

            return selected;
        }
Beispiel #7
0
        /// <summary>
        /// Game tree: recursive search over the candidate words <c>words[begin..end]</c>.
        /// Each chain of words it builds is compared against the best chain seen so far, which is
        /// kept in <c>_GameNodes</c> together with its scores <c>_MinSpace</c> and <c>_MinDeep</c>.
        /// A chain replaces the best one when its <paramref name="spaceNum"/> is smaller, or equal
        /// with a smaller <paramref name="deep"/>; exact ties are handed to <c>_CompareByPos</c> or
        /// <c>CompareGroup</c>.
        /// </summary>
        /// <param name="words">Candidate words; entries <paramref name="begin"/>..<paramref name="end"/> are read.</param>
        /// <param name="nodes">Working list holding the indexes of the chain under construction; it is modified by the call.</param>
        /// <param name="init">
        /// <c>true</c> for the top-level call, which tries every index in the range as the start of
        /// a chain; <c>false</c> for a recursive step that appends <c>words[begin]</c> to the chain.
        /// </param>
        /// <param name="begin">First index of the range (inclusive).</param>
        /// <param name="end">Last index of the range (inclusive).</param>
        /// <param name="spaceNum">Running gap score of the current chain, accumulated from position differences; passed by reference.</param>
        /// <param name="deep">Number of words in the current chain; passed by reference.</param>
        /// <returns>
        /// With <paramref name="init"/> set: always <c>null</c> (the outcome is left in <c>_GameNodes</c>).
        /// Otherwise: a copy of <paramref name="nodes"/> when <c>nextStep</c> is false at the end of
        /// the step, and <c>null</c> when it is true.
        /// </returns>
        private List <int> GameTree(List <T_WordInfo> words, List <int> nodes, bool init, int begin, int end, ref int spaceNum, ref int deep)
        {
            if (init)
            {
                // Top level: try each word of the range as the first word of a chain.
                int startPos = ((T_WordInfo)words[begin]).Position;
                for (int i = begin; i <= end; i++)
                {
                    T_WordInfo wordInfo = (T_WordInfo)words[i];
                    // Characters between the start of the range and this word count as gap.
                    spaceNum = wordInfo.Position - startPos;
                    deep     = 0;
                    List <int> oneNodes;

                    if (i == end)
                    {
                        // Last word of the range: the chain is just this word.
                        oneNodes = new List <int>();
                        oneNodes.Add(i);
                        deep++;
                    }
                    else
                    {
                        oneNodes = GameTree(words, nodes, false, i, end, ref spaceNum, ref deep);
                    }

                    if (oneNodes != null)
                    {
                        bool select = false;

                        // Better than the current best: smaller gap, or same gap with fewer words.
                        if (_MinSpace > spaceNum ||
                            (_MinSpace == spaceNum && deep < _MinDeep))
                        {
                            select = true;

                            // If the current best already has a gap of 0, a subscribed
                            // SelectByFreqEvent handler gets the final say.
                            if (_MinSpace == 0)
                            {
                                if (SelectByFreqEvent != null)
                                {
                                    select = SelectByFreqEvent(words, _GameNodes, oneNodes);
                                }
                            }
                        }
                        else if (_MinDeep == deep && _MinSpace == spaceNum)
                        {
                            // Exact tie on both scores: delegate to a comparer.
                            if (_CompareByPos != null && _MinSpace == 0)
                            {
                                select = _CompareByPos(words, _GameNodes, oneNodes);
                            }
                            else
                            {
                                select = CompareGroup(words, _GameNodes, oneNodes, MatchDirection);
                            }
                        }


                        if (select)
                        {
                            // Adopt this chain as the new best.
                            _MinDeep  = deep;
                            _MinSpace = spaceNum;
                            _GameNodes.Clear();
                            foreach (int obj in oneNodes)
                            {
                                _GameNodes.Add(obj);
                            }
                        }
                    }
                    // Reset the working state before trying the next starting word.
                    deep = 0;
                    nodes.Clear();
                }
            }
            else
            {
                // Recursive step: append words[begin] to the chain.
                nodes.Add(begin);
                deep++;

                T_WordInfo last = (T_WordInfo)words[begin];

                bool nextStep = false; // set when a following, non-overlapping word was tried
                bool reach    = false; // set when the latest successor tried became the new best
                int  endPos   = last.Position + last.Word.Length - 1; // furthest last-character index seen in this step

                // Snapshot of the scores so each alternative successor starts from the same state.
                int oldDeep  = deep;
                int oldSpace = spaceNum;

                for (int i = begin + 1; i <= end; i++)
                {
                    T_WordInfo cur = (T_WordInfo)words[i];

                    if (endPos < cur.Position + cur.Word.Length - 1)
                    {
                        endPos = cur.Position + cur.Word.Length - 1;
                    }


                    // Only words that start at or after the end of "last" can follow it.
                    if (last.Position + last.Word.Length <= cur.Position)
                    {
                        nextStep = true;

                        if (reach)
                        {
                            // The previous successor was accepted, so its path was left in nodes;
                            // restore the snapshot before trying this alternative.
                            // NOTE(review): only the final index is removed here, whereas the
                            // rejection paths below truncate back to oldDeep - confirm this is
                            // intended when the accepted path added more than one index.
                            reach    = false;
                            spaceNum = oldSpace;
                            deep     = oldDeep;
                            nodes.RemoveAt(nodes.Count - 1);
                        }

                        // Characters skipped between "last" and this successor count as gap.
                        spaceNum += cur.Position - (last.Position + last.Word.Length);
                        List <int> oneNodes;
                        oneNodes = GameTree(words, nodes, false, i, end, ref spaceNum, ref deep);

                        if (oneNodes != null)
                        {
                            bool select = false;

                            // Same ranking as at the top level, without the SelectByFreqEvent hook.
                            if (_MinSpace > spaceNum ||
                                (_MinSpace == spaceNum && deep < _MinDeep))
                            {
                                select = true;
                            }
                            else if (_MinDeep == deep && _MinSpace == spaceNum)
                            {
                                if (_CompareByPos != null && _MinSpace == 0)
                                {
                                    select = _CompareByPos(words, _GameNodes, oneNodes);
                                }
                                else
                                {
                                    select = CompareGroup(words, _GameNodes, oneNodes, MatchDirection);
                                }
                            }


                            if (select)
                            {
                                // New best chain; clearing nextStep lets this step return the
                                // chain unless a later successor sets it again.
                                reach     = true;
                                nextStep  = false;
                                _MinDeep  = deep;
                                _MinSpace = spaceNum;
                                _GameNodes.Clear();
                                foreach (int obj in oneNodes)
                                {
                                    _GameNodes.Add(obj);
                                }
                            }
                            else
                            {
                                // Rejected: roll scores and nodes back to the snapshot.
                                spaceNum = oldSpace;
                                deep     = oldDeep;
                                nodes.RemoveRange(deep, nodes.Count - deep);
                            }
                        }
                        else
                        {
                            // The successor produced no chain: roll back to the snapshot.
                            spaceNum = oldSpace;
                            deep     = oldDeep;
                            nodes.RemoveRange(deep, nodes.Count - deep);
                        }
                    }
                }

                if (!nextStep)
                {
                    // Reached when no successor was tried, or when the last one tried was accepted.
                    // Characters between the end of "last" and endPos are added to the gap.
                    spaceNum += endPos - (last.Position + last.Word.Length - 1);

                    // Hand back a copy so later changes to nodes do not affect the result.
                    List <int> ret = new List <int>();

                    foreach (int obj in nodes)
                    {
                        ret.Add(obj);
                    }

                    return(ret);
                }
            }

            return(null);
        }