Пример #1
0
        /// <summary>
        /// Creates a word entry for a match located inside the original text.
        /// </summary>
        /// <param name="pl">Match descriptor providing position, length, level and word attributes.</param>
        /// <param name="oringinalText">Text the match refers to; the word is the substring at the match position.</param>
        /// <param name="parameters">Supplies the rank value assigned for each match level.</param>
        public WordInfo(Dict.PositionLength pl, string oringinalText, Match.MatchParameter parameters)
        {
            Word      = oringinalText.Substring(pl.Position, pl.Length);
            Pos       = pl.WordAttr.Pos;
            Frequency = pl.WordAttr.Frequency;
            WordType  = WordType.SimplifiedChinese;
            Position  = pl.Position;

            // Levels 1..3 have dedicated ranks; level 0 and any other value use the best rank.
            switch (pl.Level)
            {
            case 1:
                Rank = parameters.SecRank;
                break;

            case 2:
                Rank = parameters.ThirdRank;
                break;

            case 3:
                Rank = parameters.SingleRank;
                break;

            default:
                Rank = parameters.BestRank;
                break;
            }
        }
Пример #2
0
        /// <summary>
        /// Creates a new <see cref="MatchParameter"/> and copies into it the value of
        /// every field that <c>GetType().GetFields()</c> reports for this instance.
        /// </summary>
        /// <returns>The newly created copy.</returns>
        public MatchParameter Clone()
        {
            MatchParameter copy   = new MatchParameter();
            FieldInfo[]    fields = GetType().GetFields();

            for (int i = 0; i < fields.Length; i++)
            {
                // Values are transferred as-is; reference-typed fields end up shared with the source.
                fields[i].SetValue(copy, fields[i].GetValue(this));
            }

            return copy;
        }
Пример #3
0
        /// <summary>
        /// Segments <paramref name="text"/> into words while holding the dictionary lock in share mode.
        /// </summary>
        /// <param name="text">Text to segment. A null or empty value yields an empty list without taking the lock.</param>
        /// <param name="options">Match options; when null, the options from the PanGu settings are used.</param>
        /// <param name="parameters">Match parameters; when null, the parameters from the PanGu settings are used.</param>
        /// <returns>The segmented words, after optional stop-word filtering and post-processing.</returns>
        public ICollection <WordInfo> DoSegment(string text, Match.MatchOptions options, Match.MatchParameter parameters)
        {
            if (string.IsNullOrEmpty(text))
            {
                return(new SuperLinkedList <WordInfo>());
            }

            // Acquire the lock BEFORE entering the try block: if Enter itself throws,
            // the finally clause must not call Leave on a lock that was never taken.
            Dict.DictionaryLoader.Lock.Enter(微博舆论.Framework.Lock.Mode.Share);

            try
            {
                _Options    = options;
                _Parameters = parameters;

                Init();

                // Fall back to the configured defaults only after Init(), preserving the original order.
                if (_Options == null)
                {
                    _Options = Setting.PanGuSettings.Config.MatchOptions;
                }

                if (_Parameters == null)
                {
                    _Parameters = Setting.PanGuSettings.Config.Parameters;
                }

                SuperLinkedList <WordInfo> result = PreSegment(text);

                if (_Options.FilterStopWords)
                {
                    FilterStopWord(result);
                }

                ProcessAfterSegment(text, result);

                return(result);
            }
            finally
            {
                Dict.DictionaryLoader.Lock.Leave();
            }
        }
Пример #4
0
        /// <summary>
        /// Turns the candidate dictionary matches for a piece of text into an ordered list of words,
        /// then merges in the words reported by <c>GetUnknowWords</c> for positions not covered.
        /// </summary>
        /// <param name="positionLenArr">Candidate matches; forwarded to <c>GetLeafNodeArray</c>.</param>
        /// <param name="orginalText">Text being segmented; word strings are cut from it.</param>
        /// <param name="count">
        /// Forwarded to <c>GetLeafNodeArray</c>; a value of 0 short-circuits to the "no matches" handling.
        /// NOTE(review): presumably the number of valid entries in <paramref name="positionLenArr"/> - confirm.
        /// </param>
        /// <returns>The resulting word list.</returns>
        public SuperLinkedList <WordInfo> Match(微博舆论.Dict.PositionLength[] positionLenArr, string orginalText, int count)
        {
            // Fall back to the configured defaults when no options/parameters were supplied.
            if (_Options == null)
            {
                _Options = Setting.PanGuSettings.Config.MatchOptions;
            }

            if (_Parameters == null)
            {
                _Parameters = Setting.PanGuSettings.Config.Parameters;
            }

            // One mask slot per character of the text. This method writes 1 for a position covered by a
            // single-character word and 2 for positions covered by a longer word; it later tests for 11.
            int[] masks      = new int[orginalText.Length];
            int   redundancy = _Parameters.Redundancy;

            SuperLinkedList <WordInfo> result = new SuperLinkedList <WordInfo>();

            // No candidate matches: either return the whole text as one word or one word per character.
            if (count == 0)
            {
                if (_Options.UnknownWordIdentify)
                {
                    // Whole text becomes a single word of type None with rank 1.
                    WordInfo wi = new WordInfo();
                    wi.Word     = orginalText;
                    wi.Position = 0;
                    wi.WordType = WordType.None;
                    wi.Rank     = 1;
                    result.AddFirst(wi);
                    return(result);
                }
                else
                {
                    // Each character becomes its own word of type None with rank 1.
                    int position = 0;
                    foreach (char c in orginalText)
                    {
                        WordInfo wi = new WordInfo();
                        wi.Word     = c.ToString();
                        wi.Position = position++;
                        wi.WordType = WordType.None;
                        wi.Rank     = 1;
                        result.AddLast(wi);
                    }

                    return(result);
                }
            }

            Node[] leafNodeArray = GetLeafNodeArray(positionLenArr, orginalText.Length, count);

            // The next lines are the old algorithm, which did not use the isolated-point splitting approach
            //Node[] leafNodeArray = GetLeafNodeArrayCore(positionLenArr, orginalText.Length, count);
            //Framework.QuickSort<Node>.TopSort(leafNodeArray,
            //    _LeafNodeList.Count, (int)Math.Min(TopRecord, _LeafNodeList.Count), new NodeComparer());

            // j counts the leaf nodes consumed so far (it mirrors the foreach index).
            int j = 0;

            // Collect the first TopRecord word sequences
            foreach (Node node in leafNodeArray)
            {
                // A null slot ends the usable part of the array.
                if (leafNodeArray[j] == null)
                {
                    break;
                }

                if (j >= TopRecord || j >= leafNodeArray.Length)
                {
                    break;
                }

                Dict.PositionLength[] comb = new 微博舆论.Dict.PositionLength[node.AboveCount];

                // Walk from the leaf up through Parent links, filling the combination from back to front.
                int  i   = node.AboveCount - 1;
                Node cur = node;

                while (i >= 0)
                {
                    comb[i] = cur.PositionLength;
                    cur     = cur.Parent;
                    i--;
                }

                // NOTE(review): _AllCombinations is not cleared in this method - presumably reset elsewhere; confirm.
                _AllCombinations.Add(comb);

                j++;
            }

            //Force single word
            // Force unigram segmentation: add a combination with one level-3 entry per character.
            if (_Options.ForceSingleWord)
            {
                Dict.PositionLength[] comb = new 微博舆论.Dict.PositionLength[orginalText.Length];

                for (int i = 0; i < comb.Length; i++)
                {
                    微博舆论.Dict.PositionLength pl = new 微博舆论.Dict.PositionLength(i, 1, new WordAttribute(orginalText[i].ToString(), POS.POS_UNK, 0));
                    pl.Level = 3;
                    comb[i]  = pl;
                }

                _AllCombinations.Add(comb);
            }

            if (_AllCombinations.Count > 0)
            {
                ICollection <Dict.PositionLength> positionCollection = MergeAllCombinations(redundancy);

                foreach (Dict.PositionLength pl in positionCollection)
                //for (int i = 0; i < _AllCombinations[0].Length; i++)
                {
                    //result.AddLast(new WordInfo(_AllCombinations[0][i], orginalText));
                    result.AddLast(new WordInfo(pl, orginalText, _Parameters));
                    // Record coverage: 2 for every character of a multi-character word, 1 for a single character.
                    if (pl.Length > 1)
                    {
                        for (int k = pl.Position;
                             k < pl.Position + pl.Length; k++)
                        {
                            masks[k] = 2;
                        }
                    }
                    else
                    {
                        masks[pl.Position] = 1;
                    }
                }
            }

            // Region below: merge unknown (out-of-dictionary) words into the result.
            #region 合并未登录词

            bool            needRemoveSingleWord;
            List <WordInfo> unknownWords = GetUnknowWords(masks, orginalText, out needRemoveSingleWord);

            // Merge them into the matching positions of the result sequence
            if (unknownWords.Count > 0)
            {
                SuperLinkedListNode <WordInfo> cur = result.First;

                if (needRemoveSingleWord && !_Options.ForceSingleWord)
                {
                    // Remove the single-character words that need to be removed

                    while (cur != null)
                    {
                        if (cur.Value.Word.Length == 1)
                        {
                            // NOTE(review): 11 is never written in this method - presumably set by GetUnknowWords; confirm.
                            if (masks[cur.Value.Position] == 11)
                            {
                                SuperLinkedListNode <WordInfo> removeItem = cur;

                                // Advance before removing so the traversal does not depend on the removed node.
                                cur = cur.Next;

                                result.Remove(removeItem);

                                continue;
                            }
                        }

                        cur = cur.Next;
                    }
                }

                cur = result.First;

                // j is reused as the index of the next unknown word to insert.
                j = 0;

                while (cur != null)
                {
                    // Insert the unknown word in front of the first result word at or after its position.
                    if (cur.Value.Position >= unknownWords[j].Position)
                    {
                        result.AddBefore(cur, unknownWords[j]);
                        j++;
                        if (j >= unknownWords.Count)
                        {
                            break;
                        }
                    }

                    // Only advance once the current word lies before the next pending unknown word.
                    if (cur.Value.Position < unknownWords[j].Position)
                    {
                        cur = cur.Next;
                    }
                }

                // Unknown words positioned after every existing word are appended at the end.
                while (j < unknownWords.Count)
                {
                    result.AddLast(unknownWords[j]);
                    j++;
                }
            }


            #endregion



            return(result);
        }
Пример #5
0
        /// <summary>
        /// Segments <paramref name="text"/> into words while holding the dictionary lock in share mode.
        /// </summary>
        /// <param name="text">Text to segment. A null or empty value yields an empty list without taking the lock.</param>
        /// <param name="options">Match options; when null, the options from the PanGu settings are used.</param>
        /// <param name="parameters">Match parameters; when null, the parameters from the PanGu settings are used.</param>
        /// <returns>The segmented words, after optional stop-word filtering and post-processing.</returns>
        public ICollection<WordInfo> DoSegment(string text, Match.MatchOptions options, Match.MatchParameter parameters)
        {
            if (string.IsNullOrEmpty(text))
            {
                return new SuperLinkedList<WordInfo>();
            }

            // Acquire the lock BEFORE entering the try block: if Enter itself throws,
            // the finally clause must not call Leave on a lock that was never taken.
            Dict.DictionaryLoader.Lock.Enter(微博舆论.Framework.Lock.Mode.Share);

            try
            {
                _Options = options;
                _Parameters = parameters;

                Init();

                // Fall back to the configured defaults only after Init(), preserving the original order.
                if (_Options == null)
                {
                    _Options = Setting.PanGuSettings.Config.MatchOptions;
                }

                if (_Parameters == null)
                {
                    _Parameters = Setting.PanGuSettings.Config.Parameters;
                }

                SuperLinkedList<WordInfo> result = PreSegment(text);

                if (_Options.FilterStopWords)
                {
                    FilterStopWord(result);
                }

                ProcessAfterSegment(text, result);

                return result;
            }
            finally
            {
                Dict.DictionaryLoader.Lock.Leave();
            }
        }
Пример #6
0
        /// <summary>
        /// Creates a new <see cref="MatchParameter"/> and copies into it the value of
        /// every field that <c>GetType().GetFields()</c> reports for this instance.
        /// </summary>
        /// <returns>The newly created copy.</returns>
        public MatchParameter Clone()
        {
            MatchParameter copy = new MatchParameter();
            FieldInfo[] fields = GetType().GetFields();

            for (int i = 0; i < fields.Length; i++)
            {
                // Values are transferred as-is; reference-typed fields end up shared with the source.
                fields[i].SetValue(copy, fields[i].GetValue(this));
            }

            return copy;
        }