/// <summary>
/// Splits a long sentence at its isolated points and then segments each
/// fragment separately, merging the per-fragment results into one array.
/// Segmenting long Chinese sentences troubled me for three years without a good
/// solution. Unexpectedly, while watching the opening ceremony of the 2010 World
/// Cup, I had a flash of inspiration and came up with this isolated-point
/// splitting algorithm, which completely solved this long-standing problem.
/// eaglet 11th Jun 2010, comment kept as a memento.
/// </summary>
/// <remarks>
/// An "isolated point" is an entry whose start position is at or beyond the
/// right-most boundary reached by all preceding entries, i.e. no earlier
/// candidate word overlaps it. Entries are scanned in array order, so the
/// caller is presumably expected to pass them ordered by position (verify
/// against the caller).
/// </remarks>
/// <param name="positionLenArr">Array of word components carrying position and length information.</param>
/// <param name="orginalTextLength">Length of the original string. Not used by this method.</param>
/// <param name="count">Number of valid entries in <paramref name="positionLenArr"/>.</param>
/// <returns>
/// An array of <c>TopRecord</c> slots filled by <c>CombineNodeArr</c>; all slots
/// are left null when <paramref name="count"/> is not positive.
/// </returns>
private Node[] GetLeafNodeArray(PanGu.Dict.PositionLength[] positionLenArr, int orginalTextLength, int count)
{
    Node[] result = new Node[TopRecord];

    // Nothing to segment: return the empty result instead of indexing
    // positionLenArr[0], which would throw on an empty array.
    if (count <= 0)
    {
        return result;
    }

    // Right-most text offset covered by any entry seen so far.
    int lastRightBoundary = positionLenArr[0].Position + positionLenArr[0].Length;

    // Index of the first entry of the fragment currently being collected.
    int lastIndex = 0;

    for (int i = 1; i < count; i++)
    {
        if (positionLenArr[i].Position >= lastRightBoundary)
        {
            // Entry i starts past everything before it, so the fragment
            // [lastIndex, i) is closed and can be segmented on its own.
            AppendSegmentLeafNodes(result, positionLenArr, lastIndex, i - lastIndex, lastRightBoundary);
            lastIndex = i;
        }

        int newRightBoundary = positionLenArr[i].Position + positionLenArr[i].Length;
        if (newRightBoundary > lastRightBoundary)
        {
            lastRightBoundary = newRightBoundary;
        }
    }

    // lastIndex is at most count - 1 here, so a trailing fragment always remains.
    AppendSegmentLeafNodes(result, positionLenArr, lastIndex, count - lastIndex, lastRightBoundary);

    return result;
}

/// <summary>
/// Segments one fragment of <paramref name="positionLenArr"/> and combines its
/// leaf nodes into <paramref name="result"/>.
/// </summary>
/// <param name="result">Accumulated result array, updated through <c>CombineNodeArr</c>.</param>
/// <param name="positionLenArr">The full array of word components.</param>
/// <param name="start">Index of the fragment's first entry.</param>
/// <param name="length">Number of entries in the fragment.</param>
/// <param name="rightBoundary">Right-most text offset covered by the fragment.</param>
private void AppendSegmentLeafNodes(Node[] result, PanGu.Dict.PositionLength[] positionLenArr, int start, int length, int rightBoundary)
{
    // GetLeafNodeArrayCore receives the fragment as its own array.
    PanGu.Dict.PositionLength[] segment = new PanGu.Dict.PositionLength[length];
    Array.Copy(positionLenArr, start, segment, 0, length);

    // The fragment's text length is measured from its first entry's position.
    Node[] leafNodeArray = GetLeafNodeArrayCore(segment, rightBoundary - positionLenArr[start].Position, length);

    // _LeafNodeList is read only after GetLeafNodeArrayCore has run, as in the
    // original code; presumably the core call populates it (verify).
    Framework.QuickSort<Node>.TopSort(leafNodeArray,
        _LeafNodeList.Count,
        (int)Math.Min(TopRecord, _LeafNodeList.Count),
        new NodeComparer(_Options.FrequencyFirst));

    CombineNodeArr(result, leafNodeArray);
}
/// <summary>
/// Builds the final word list for <paramref name="orginalText"/> from the
/// candidate words in <paramref name="positionLenArr"/>.
/// </summary>
/// <remarks>
/// Steps, in order: resolve options/parameters from the global settings when
/// they are not set; short-circuit when there are no candidates; collect the
/// top word sequences from the leaf nodes; optionally add a one-character-per-word
/// sequence; merge all sequences into the result; finally splice in the
/// unknown words reported by <c>GetUnknowWords</c>.
/// </remarks>
/// <param name="positionLenArr">Candidate words with position and length information.</param>
/// <param name="orginalText">The text being segmented.</param>
/// <param name="count">Number of valid entries in <paramref name="positionLenArr"/>.</param>
/// <returns>The linked list of resulting words.</returns>
public SuperLinkedList<WordInfo> Match(PanGu.Dict.PositionLength[] positionLenArr, string orginalText, int count)
{
    if (_Options == null)
    {
        _Options = Setting.PanGuSettings.Config.MatchOptions;
    }

    if (_Parameters == null)
    {
        _Parameters = Setting.PanGuSettings.Config.Parameters;
    }

    // Per-character marks: 1 = covered by a single-character word,
    // 2 = covered by a multi-character word. Other values (such as the 11
    // tested below) are presumably written by GetUnknowWords (verify).
    int[] masks = new int[orginalText.Length];
    int redundancy = _Parameters.Redundancy;

    SuperLinkedList<WordInfo> result = new SuperLinkedList<WordInfo>();

    // No candidates at all: emit the whole text as one word, or one word per character.
    if (count == 0)
    {
        if (_Options.UnknownWordIdentify)
        {
            WordInfo whole = new WordInfo();
            whole.Word = orginalText;
            whole.Position = 0;
            whole.WordType = WordType.None;
            whole.Rank = 1;
            result.AddFirst(whole);
        }
        else
        {
            for (int pos = 0; pos < orginalText.Length; pos++)
            {
                WordInfo single = new WordInfo();
                single.Word = orginalText[pos].ToString();
                single.Position = pos;
                single.WordType = WordType.None;
                single.Rank = 1;
                result.AddLast(single);
            }
        }

        return result;
    }

    // Isolated-point splitting. The older approach called GetLeafNodeArrayCore
    // on the whole input and TopSort-ed the result directly.
    Node[] leafNodeArray = GetLeafNodeArray(positionLenArr, orginalText.Length, count);

    // Collect the first TopRecord word sequences, stopping at the first empty slot.
    for (int rank = 0; rank < leafNodeArray.Length; rank++)
    {
        Node leaf = leafNodeArray[rank];
        if (leaf == null)
        {
            break;
        }

        if (rank >= TopRecord)
        {
            break;
        }

        // Walk from the leaf up through its parents, filling the sequence back to front.
        int depth = leaf.AboveCount;
        PanGu.Dict.PositionLength[] sequence = new PanGu.Dict.PositionLength[depth];
        Node walker = leaf;
        for (int slot = depth - 1; slot >= 0; slot--)
        {
            sequence[slot] = walker.PositionLength;
            walker = walker.Parent;
        }

        _AllCombinations.Add(sequence);
    }

    // Force single word: add a sequence with every character as its own word.
    if (_Options.ForceSingleWord)
    {
        PanGu.Dict.PositionLength[] singles = new PanGu.Dict.PositionLength[orginalText.Length];
        for (int idx = 0; idx < singles.Length; idx++)
        {
            WordAttribute attr = new WordAttribute(orginalText[idx].ToString(), POS.POS_UNK, 0);
            PanGu.Dict.PositionLength item = new PanGu.Dict.PositionLength(idx, 1, attr);
            item.Level = 3;
            singles[idx] = item;
        }

        _AllCombinations.Add(singles);
    }

    if (_AllCombinations.Count > 0)
    {
        ICollection<PanGu.Dict.PositionLength> merged = MergeAllCombinations(redundancy);

        foreach (PanGu.Dict.PositionLength entry in merged)
        {
            result.AddLast(new WordInfo(entry, orginalText, _Parameters));

            if (entry.Length > 1)
            {
                int end = entry.Position + entry.Length;
                for (int k = entry.Position; k < end; k++)
                {
                    masks[k] = 2;
                }
            }
            else
            {
                masks[entry.Position] = 1;
            }
        }
    }

    #region Merge unknown words

    bool needRemoveSingleWord;
    List<WordInfo> unknownWords = GetUnknowWords(masks, orginalText, out needRemoveSingleWord);

    if (unknownWords.Count == 0)
    {
        return result;
    }

    if (needRemoveSingleWord && !_Options.ForceSingleWord)
    {
        // Drop single-character words whose position is marked 11 in masks.
        SuperLinkedListNode<WordInfo> scan = result.First;
        while (scan != null)
        {
            // Capture the successor first; the node may be removed below.
            SuperLinkedListNode<WordInfo> following = scan.Next;

            if (scan.Value.Word.Length == 1 && masks[scan.Value.Position] == 11)
            {
                result.Remove(scan);
            }

            scan = following;
        }
    }

    // Insert each unknown word before the first result word at or after its position.
    int pending = 0;
    SuperLinkedListNode<WordInfo> cursor = result.First;
    while (cursor != null)
    {
        if (cursor.Value.Position >= unknownWords[pending].Position)
        {
            result.AddBefore(cursor, unknownWords[pending]);
            pending++;

            if (pending >= unknownWords.Count)
            {
                break;
            }
        }

        // Advance only when the next unknown word belongs after the cursor;
        // otherwise stay so it is also inserted before the same node.
        if (cursor.Value.Position < unknownWords[pending].Position)
        {
            cursor = cursor.Next;
        }
    }

    // Whatever is left lies beyond the last result word.
    for (; pending < unknownWords.Count; pending++)
    {
        result.AddLast(unknownWords[pending]);
    }

    #endregion

    return result;
}