Beispiel #1
0
        /// <summary>
        /// Splits the candidate words into independent segments at "isolated points" and
        /// builds the leaf nodes of each segment separately.
        /// A new segment begins at every word whose start position is at or beyond the right
        /// boundary of all words seen before it. Each closed segment is handed to
        /// GetLeafNodeArrayCore, top-sorted, and merged into the result by CombineNodeArr.
        /// Author's note (eaglet, 11 Jun 2010): segmenting long Chinese sentences troubled me
        /// for three years without a good solution. While watching the opening ceremony of the
        /// 2010 World Cup I had a sudden inspiration for this isolated-point splitting
        /// algorithm, which finally solved the problem.
        /// </summary>
        /// <param name="positionLenArr">Array of word components carrying position and length information.</param>
        /// <param name="orginalTextLength">Length of the original string (not read by this method).</param>
        /// <param name="count">Number of entries of <paramref name="positionLenArr"/> to process.</param>
        /// <returns>
        /// A Node array of length TopRecord into which every segment's sorted leaf nodes have
        /// been merged; it is returned untouched when <paramref name="count"/> is zero or negative.
        /// </returns>
        private Node[] GetLeafNodeArray(微博舆论.Dict.PositionLength[] positionLenArr, int orginalTextLength, int count)
        {
            Node[] result = new Node[TopRecord];

            // Nothing to segment. Returning here also avoids indexing positionLenArr[0]
            // when the array is empty.
            if (count <= 0)
            {
                return result;
            }

            int lastRightBoundary = positionLenArr[0].Position + positionLenArr[0].Length;
            int segmentStart = 0;

            for (int i = 1; i < count; i++)
            {
                if (positionLenArr[i].Position >= lastRightBoundary)
                {
                    // Word i starts at or after the end of everything before it, so the
                    // words in [segmentStart, i) form a closed segment.
                    CombineSegmentLeafNodes(
                        result,
                        positionLenArr,
                        segmentStart,
                        i - segmentStart,
                        lastRightBoundary - positionLenArr[segmentStart].Position);

                    segmentStart = i;
                }

                int newRightBoundary = positionLenArr[i].Position + positionLenArr[i].Length;

                if (newRightBoundary > lastRightBoundary)
                {
                    lastRightBoundary = newRightBoundary;
                }
            }

            // segmentStart is always below count here, so the trailing segment holds at
            // least one word and must be processed as well.
            CombineSegmentLeafNodes(
                result,
                positionLenArr,
                segmentStart,
                count - segmentStart,
                lastRightBoundary - positionLenArr[segmentStart].Position);

            return result;
        }

        /// <summary>
        /// Copies one segment out of <paramref name="positionLenArr"/>, builds and top-sorts
        /// its leaf nodes, and merges them into <paramref name="result"/>.
        /// </summary>
        /// <param name="result">Array that receives the merged leaf nodes.</param>
        /// <param name="positionLenArr">Source array holding all word components.</param>
        /// <param name="segmentStart">Index of the first word of the segment.</param>
        /// <param name="segmentCount">Number of words in the segment.</param>
        /// <param name="segmentLength">Distance from the segment's first word position to its right boundary.</param>
        private void CombineSegmentLeafNodes(Node[] result, 微博舆论.Dict.PositionLength[] positionLenArr, int segmentStart, int segmentCount, int segmentLength)
        {
            微博舆论.Dict.PositionLength[] segment = new 微博舆论.Dict.PositionLength[segmentCount];
            Array.Copy(positionLenArr, segmentStart, segment, 0, segmentCount);

            Node[] leafNodeArray = GetLeafNodeArrayCore(segment, segmentLength, segmentCount);

            // _LeafNodeList is read only after GetLeafNodeArrayCore has run, exactly as before;
            // presumably that call fills it for the current segment - confirm.
            Framework.QuickSort<Node>.TopSort(leafNodeArray, _LeafNodeList.Count, (int)Math.Min(TopRecord, _LeafNodeList.Count), new NodeComparer(_Options.FrequencyFirst));

            CombineNodeArr(result, leafNodeArray);
        }
Beispiel #2
0
        /// <summary>
        /// Produces the ordered word list for <paramref name="orginalText"/> from the candidate
        /// words in <paramref name="positionLenArr"/>.
        /// </summary>
        /// <param name="positionLenArr">Candidate words (position/length entries) found in the text.</param>
        /// <param name="orginalText">The text being segmented.</param>
        /// <param name="count">Number of entries of <paramref name="positionLenArr"/> to use.</param>
        /// <returns>
        /// When <paramref name="count"/> is 0: either the whole text as a single word (if
        /// UnknownWordIdentify is set) or one word per character. Otherwise: the words returned
        /// by MergeAllCombinations with the words from GetUnknowWords inserted by position.
        /// </returns>
        public SuperLinkedList<WordInfo> Match(微博舆论.Dict.PositionLength[] positionLenArr, string orginalText, int count)
        {
            // Fall back to the global configuration when nothing has been assigned yet.
            if (_Options == null)
            {
                _Options = Setting.PanGuSettings.Config.MatchOptions;
            }

            if (_Parameters == null)
            {
                _Parameters = Setting.PanGuSettings.Config.Parameters;
            }

            // One mark per character of the text: 1 = covered by a single-character word,
            // 2 = covered by a multi-character word, 0 = not covered.
            int[] masks = new int[orginalText.Length];
            int redundancy = _Parameters.Redundancy;

            SuperLinkedList<WordInfo> result = new SuperLinkedList<WordInfo>();

            if (count == 0)
            {
                // No candidate words at all.
                if (_Options.UnknownWordIdentify)
                {
                    // Emit the whole text as one word.
                    WordInfo wholeText = new WordInfo();
                    wholeText.Word = orginalText;
                    wholeText.Position = 0;
                    wholeText.WordType = WordType.None;
                    wholeText.Rank = 1;
                    result.AddFirst(wholeText);
                }
                else
                {
                    // Emit every character as its own word.
                    for (int charIndex = 0; charIndex < orginalText.Length; charIndex++)
                    {
                        WordInfo singleChar = new WordInfo();
                        singleChar.Word = orginalText[charIndex].ToString();
                        singleChar.Position = charIndex;
                        singleChar.WordType = WordType.None;
                        singleChar.Rank = 1;
                        result.AddLast(singleChar);
                    }
                }

                return result;
            }

            // Leaf nodes come from the isolated-point splitting in GetLeafNodeArray. The earlier
            // approach called GetLeafNodeArrayCore on the whole text followed by a single TopSort.
            Node[] leafNodeArray = GetLeafNodeArray(positionLenArr, orginalText.Length, count);

            // Collect the word sequences of the first TopRecord leaf nodes, stopping at the
            // first empty slot.
            for (int leafIndex = 0; leafIndex < leafNodeArray.Length; leafIndex++)
            {
                Node leaf = leafNodeArray[leafIndex];

                if (leaf == null || leafIndex >= TopRecord)
                {
                    break;
                }

                Dict.PositionLength[] sequence = new 微博舆论.Dict.PositionLength[leaf.AboveCount];

                // Walk from the leaf up through Parent, filling the sequence back to front.
                Node walker = leaf;

                for (int slot = leaf.AboveCount - 1; slot >= 0; slot--)
                {
                    sequence[slot] = walker.PositionLength;
                    walker = walker.Parent;
                }

                _AllCombinations.Add(sequence);
            }

            // Forced single-character segmentation: add a combination in which every
            // character is a word of length 1.
            if (_Options.ForceSingleWord)
            {
                Dict.PositionLength[] unigrams = new 微博舆论.Dict.PositionLength[orginalText.Length];

                for (int charIndex = 0; charIndex < unigrams.Length; charIndex++)
                {
                    微博舆论.Dict.PositionLength unigram = new 微博舆论.Dict.PositionLength(charIndex, 1, new WordAttribute(orginalText[charIndex].ToString(), POS.POS_UNK, 0));
                    unigram.Level = 3;
                    unigrams[charIndex] = unigram;
                }

                _AllCombinations.Add(unigrams);
            }

            if (_AllCombinations.Count > 0)
            {
                ICollection<Dict.PositionLength> mergedWords = MergeAllCombinations(redundancy);

                foreach (Dict.PositionLength word in mergedWords)
                {
                    result.AddLast(new WordInfo(word, orginalText, _Parameters));

                    if (word.Length <= 1)
                    {
                        masks[word.Position] = 1;
                        continue;
                    }

                    int wordEnd = word.Position + word.Length;

                    for (int charIndex = word.Position; charIndex < wordEnd; charIndex++)
                    {
                        masks[charIndex] = 2;
                    }
                }
            }

            #region Merge unknown (out-of-dictionary) words

            bool needRemoveSingleWord;
            List<WordInfo> unknownWords = GetUnknowWords(masks, orginalText, out needRemoveSingleWord);

            if (unknownWords.Count == 0)
            {
                return result;
            }

            if (needRemoveSingleWord && !_Options.ForceSingleWord)
            {
                // Drop single-character words whose mask equals 11. That value is never
                // assigned in this method; presumably GetUnknowWords sets it - confirm.
                SuperLinkedListNode<WordInfo> scan = result.First;

                while (scan != null)
                {
                    // Remember the successor first so removal cannot break the walk.
                    SuperLinkedListNode<WordInfo> successor = scan.Next;

                    if (scan.Value.Word.Length == 1 && masks[scan.Value.Position] == 11)
                    {
                        result.Remove(scan);
                    }

                    scan = successor;
                }
            }

            // Insert each unknown word in front of the first existing word whose position
            // is not smaller than its own.
            int unknownIndex = 0;
            SuperLinkedListNode<WordInfo> current = result.First;

            while (current != null)
            {
                if (current.Value.Position < unknownWords[unknownIndex].Position)
                {
                    current = current.Next;
                    continue;
                }

                result.AddBefore(current, unknownWords[unknownIndex]);
                unknownIndex++;

                if (unknownIndex >= unknownWords.Count)
                {
                    break;
                }
            }

            // Unknown words positioned after every existing word go to the end.
            for (; unknownIndex < unknownWords.Count; unknownIndex++)
            {
                result.AddLast(unknownWords[unknownIndex]);
            }

            #endregion

            return result;
        }