/// <summary>
/// Creates a word entry from a position/length record located inside
/// <paramref name="oringinalText"/>.
/// </summary>
/// <param name="pl">
/// Supplies the start position and length of the word in the text, its
/// word attribute (part of speech and frequency) and its level.
/// </param>
/// <param name="oringinalText">Text the word is cut out of via <c>Substring</c>.</param>
/// <param name="parameters">
/// Supplies the rank for each level: 0 maps to <c>BestRank</c>, 1 to <c>SecRank</c>,
/// 2 to <c>ThirdRank</c>, 3 to <c>SingleRank</c>; any other level falls back to
/// <c>BestRank</c>.
/// </param>
public WordInfo(Dict.PositionLength pl, string oringinalText, Match.MatchParameter parameters)
{
    Word = oringinalText.Substring(pl.Position, pl.Length);
    Pos = pl.WordAttr.Pos;
    Frequency = pl.WordAttr.Frequency;
    this.WordType = WordType.SimplifiedChinese;
    Position = pl.Position;

    // Level 0 and every unrecognised level share the BestRank fallback.
    if (pl.Level == 1)
    {
        Rank = parameters.SecRank;
    }
    else if (pl.Level == 2)
    {
        Rank = parameters.ThirdRank;
    }
    else if (pl.Level == 3)
    {
        Rank = parameters.SingleRank;
    }
    else
    {
        Rank = parameters.BestRank;
    }
}
/// <summary>
/// Creates a new <c>MatchParameter</c> and copies into it the value of every
/// field returned by <c>GetType().GetFields()</c> (the public fields).
/// </summary>
/// <returns>
/// The new instance. Values are transferred with <c>GetValue</c>/<c>SetValue</c>,
/// so reference-type field values are shared with this instance, not duplicated.
/// </returns>
public MatchParameter Clone()
{
    MatchParameter copy = new MatchParameter();
    FieldInfo[] publicFields = this.GetType().GetFields();

    for (int index = 0; index < publicFields.Length; index++)
    {
        FieldInfo field = publicFields[index];
        field.SetValue(copy, field.GetValue(this));
    }

    return copy;
}
/// <summary>
/// Runs the segmentation pipeline on <paramref name="text"/>: <c>PreSegment</c>,
/// optional stop-word filtering, then <c>ProcessAfterSegment</c>. The work is done
/// while holding the dictionary loader's lock in <c>Share</c> mode.
/// </summary>
/// <param name="text">Text to process. Null or empty yields an empty list without taking the lock.</param>
/// <param name="options">
/// Match options stored in <c>_Options</c>; when null,
/// <c>Setting.PanGuSettings.Config.MatchOptions</c> is used instead.
/// </param>
/// <param name="parameters">
/// Match parameters stored in <c>_Parameters</c>; when null,
/// <c>Setting.PanGuSettings.Config.Parameters</c> is used instead.
/// </param>
/// <returns>The resulting list of <c>WordInfo</c> entries.</returns>
public ICollection<WordInfo> DoSegment(string text, Match.MatchOptions options, Match.MatchParameter parameters)
{
    if (string.IsNullOrEmpty(text))
    {
        return new SuperLinkedList<WordInfo>();
    }

    // Acquire the lock BEFORE entering the try block. If Enter itself throws,
    // the finally clause must not call Leave on a lock that was never taken.
    Dict.DictionaryLoader.Lock.Enter(微博舆论.Framework.Lock.Mode.Share);

    try
    {
        _Options = options;
        _Parameters = parameters;

        // Init runs before the null fallbacks below; the order is kept as in the
        // original because Init's use of these fields is not visible here.
        Init();

        if (_Options == null)
        {
            _Options = Setting.PanGuSettings.Config.MatchOptions;
        }

        if (_Parameters == null)
        {
            _Parameters = Setting.PanGuSettings.Config.Parameters;
        }

        SuperLinkedList<WordInfo> result = PreSegment(text);

        if (_Options.FilterStopWords)
        {
            FilterStopWord(result);
        }

        ProcessAfterSegment(text, result);

        return result;
    }
    finally
    {
        Dict.DictionaryLoader.Lock.Leave();
    }
}
/// <summary>
/// Builds the list of <c>WordInfo</c> entries for <paramref name="orginalText"/> from
/// the candidate position/length records in <paramref name="positionLenArr"/>.
/// </summary>
/// <remarks>
/// When <paramref name="count"/> is 0 the result is either one entry covering the
/// whole text (when <c>UnknownWordIdentify</c> is set) or one entry per character.
/// Otherwise up to <c>TopRecord</c> combinations are collected from the leaf nodes
/// returned by <c>GetLeafNodeArray</c>, optionally extended with a
/// one-character-per-word combination (<c>ForceSingleWord</c>), merged through
/// <c>MergeAllCombinations</c>, and finally the words reported by
/// <c>GetUnknowWords</c> are spliced into the result by position.
/// </remarks>
/// <param name="positionLenArr">Candidate records handed to <c>GetLeafNodeArray</c>.</param>
/// <param name="orginalText">Text the records refer to.</param>
/// <param name="count">Number of candidates; 0 selects the no-candidate path.</param>
/// <returns>The assembled word list.</returns>
public SuperLinkedList<WordInfo> Match(微博舆论.Dict.PositionLength[] positionLenArr, string orginalText, int count)
{
    if (_Options == null)
    {
        _Options = Setting.PanGuSettings.Config.MatchOptions;
    }

    if (_Parameters == null)
    {
        _Parameters = Setting.PanGuSettings.Config.Parameters;
    }

    // One slot per character: 2 marks a character covered by a multi-character
    // word, 1 marks a single-character word, 0 means not covered by any word.
    int[] masks = new int[orginalText.Length];
    int redundancy = _Parameters.Redundancy;
    SuperLinkedList<WordInfo> result = new SuperLinkedList<WordInfo>();

    // No candidates at all.
    if (count == 0)
    {
        if (!_Options.UnknownWordIdentify)
        {
            // Emit every character as its own entry.
            for (int charIndex = 0; charIndex < orginalText.Length; charIndex++)
            {
                WordInfo single = new WordInfo();
                single.Word = orginalText[charIndex].ToString();
                single.Position = charIndex;
                single.WordType = WordType.None;
                single.Rank = 1;
                result.AddLast(single);
            }

            return result;
        }

        // Emit the whole text as one entry.
        WordInfo whole = new WordInfo();
        whole.Word = orginalText;
        whole.Position = 0;
        whole.WordType = WordType.None;
        whole.Rank = 1;
        result.AddFirst(whole);

        return result;
    }

    // An older approach (GetLeafNodeArrayCore followed by a top-sort), which did
    // not use the isolated-point splitting algorithm, was replaced by this call.
    Node[] leafNodeArray = GetLeafNodeArray(positionLenArr, orginalText.Length, count);

    // Take the first TopRecord word sequences; a null slot ends the usable part.
    for (int leafIndex = 0; leafIndex < leafNodeArray.Length; leafIndex++)
    {
        Node leaf = leafNodeArray[leafIndex];

        if (leaf == null || leafIndex >= TopRecord)
        {
            break;
        }

        // Walk from the leaf up through its parents, filling the path back to front.
        Dict.PositionLength[] path = new Dict.PositionLength[leaf.AboveCount];
        Node ancestor = leaf;

        for (int slot = leaf.AboveCount - 1; slot >= 0; slot--)
        {
            path[slot] = ancestor.PositionLength;
            ancestor = ancestor.Parent;
        }

        _AllCombinations.Add(path);
    }

    // Force single-word segmentation: add a combination with one word per character.
    if (_Options.ForceSingleWord)
    {
        Dict.PositionLength[] singles = new Dict.PositionLength[orginalText.Length];

        for (int pos = 0; pos < singles.Length; pos++)
        {
            WordAttribute attr = new WordAttribute(orginalText[pos].ToString(), POS.POS_UNK, 0);
            Dict.PositionLength unit = new Dict.PositionLength(pos, 1, attr);
            unit.Level = 3;
            singles[pos] = unit;
        }

        _AllCombinations.Add(singles);
    }

    if (_AllCombinations.Count > 0)
    {
        ICollection<Dict.PositionLength> merged = MergeAllCombinations(redundancy);

        foreach (Dict.PositionLength pl in merged)
        {
            result.AddLast(new WordInfo(pl, orginalText, _Parameters));

            if (pl.Length <= 1)
            {
                masks[pl.Position] = 1;
                continue;
            }

            for (int k = pl.Position; k < pl.Position + pl.Length; k++)
            {
                masks[k] = 2;
            }
        }
    }

    // Merge unknown (out-of-dictionary) words.
    bool needRemoveSingleWord;
    List<WordInfo> unknownWords = GetUnknowWords(masks, orginalText, out needRemoveSingleWord);

    if (unknownWords.Count == 0)
    {
        return result;
    }

    if (needRemoveSingleWord && !_Options.ForceSingleWord)
    {
        // Drop single-character entries whose mask is 11.
        // NOTE(review): 11 is never written in this method; presumably
        // GetUnknowWords sets it - confirm there.
        SuperLinkedListNode<WordInfo> scan = result.First;

        while (scan != null)
        {
            // Capture the successor first; the current node may be removed.
            SuperLinkedListNode<WordInfo> following = scan.Next;

            if (scan.Value.Word.Length == 1 && masks[scan.Value.Position] == 11)
            {
                result.Remove(scan);
            }

            scan = following;
        }
    }

    // Insert each unknown word at its corresponding position in the result.
    SuperLinkedListNode<WordInfo> anchor = result.First;
    int pending = 0;

    while (anchor != null)
    {
        if (anchor.Value.Position >= unknownWords[pending].Position)
        {
            result.AddBefore(anchor, unknownWords[pending]);
            pending++;

            if (pending >= unknownWords.Count)
            {
                break;
            }
        }

        if (anchor.Value.Position < unknownWords[pending].Position)
        {
            anchor = anchor.Next;
        }
    }

    // Unknown words positioned after every existing entry go at the end.
    for (; pending < unknownWords.Count; pending++)
    {
        result.AddLast(unknownWords[pending]);
    }

    return result;
}
/// <summary>
/// Runs the segmentation pipeline on <paramref name="text"/>: <c>PreSegment</c>,
/// optional stop-word filtering, then <c>ProcessAfterSegment</c>. The work is done
/// while holding the dictionary loader's lock in <c>Share</c> mode.
/// </summary>
/// <param name="text">Text to process. Null or empty yields an empty list without taking the lock.</param>
/// <param name="options">
/// Match options stored in <c>_Options</c>; when null,
/// <c>Setting.PanGuSettings.Config.MatchOptions</c> is used instead.
/// </param>
/// <param name="parameters">
/// Match parameters stored in <c>_Parameters</c>; when null,
/// <c>Setting.PanGuSettings.Config.Parameters</c> is used instead.
/// </param>
/// <returns>The resulting list of <c>WordInfo</c> entries.</returns>
public ICollection<WordInfo> DoSegment(string text, Match.MatchOptions options, Match.MatchParameter parameters)
{
    if (string.IsNullOrEmpty(text))
    {
        return new SuperLinkedList<WordInfo>();
    }

    // Acquire the lock BEFORE entering the try block. If Enter itself throws,
    // the finally clause must not call Leave on a lock that was never taken.
    Dict.DictionaryLoader.Lock.Enter(微博舆论.Framework.Lock.Mode.Share);

    try
    {
        _Options = options;
        _Parameters = parameters;

        // Init runs before the null fallbacks below; the order is kept as in the
        // original because Init's use of these fields is not visible here.
        Init();

        if (_Options == null)
        {
            _Options = Setting.PanGuSettings.Config.MatchOptions;
        }

        if (_Parameters == null)
        {
            _Parameters = Setting.PanGuSettings.Config.Parameters;
        }

        SuperLinkedList<WordInfo> result = PreSegment(text);

        if (_Options.FilterStopWords)
        {
            FilterStopWord(result);
        }

        ProcessAfterSegment(text, result);

        return result;
    }
    finally
    {
        Dict.DictionaryLoader.Lock.Leave();
    }
}
/// <summary>
/// Creates a new <c>MatchParameter</c> and copies into it the value of every
/// field returned by <c>GetType().GetFields()</c> (the public fields).
/// </summary>
/// <returns>
/// The new instance. Values are transferred with <c>GetValue</c>/<c>SetValue</c>,
/// so reference-type field values are shared with this instance, not duplicated.
/// </returns>
public MatchParameter Clone()
{
    MatchParameter copy = new MatchParameter();
    FieldInfo[] publicFields = this.GetType().GetFields();

    for (int index = 0; index < publicFields.Length; index++)
    {
        FieldInfo field = publicFields[index];
        field.SetValue(copy, field.GetValue(this));
    }

    return copy;
}