/// <summary>
/// Removes every entry of <paramref name="wordInfoList"/> that the stop-word
/// checker reports as a stop word. The list is modified in place.
/// </summary>
/// <param name="wordInfoList">Segmentation result to filter; a null list is ignored.</param>
private void FilterStopWord(SuperLinkedList<WordInfo> wordInfoList)
{
    if (wordInfoList == null)
    {
        return;
    }

    SuperLinkedListNode<WordInfo> node = wordInfoList.First;

    while (node != null)
    {
        // Remember the successor first: the current node may be unlinked below.
        SuperLinkedListNode<WordInfo> following = node.Next;

        bool isStopWord = _StopWord.IsStopWord(
            node.Value.Word,
            _Options.FilterEnglish,
            _Parameters.FilterEnglishLength,
            _Options.FilterNumeric,
            _Parameters.FilterNumericLength);

        if (isStopWord)
        {
            wordInfoList.Remove(node);
        }

        node = following;
    }
}
/// <summary>
/// Extracts a version number.
/// Every node after <paramref name="vWordNode"/> up to and including
/// <paramref name="lastNode"/> is removed from the list, and replaced by the full
/// version string plus its dotted prefixes, all inserted right after the "v" node.
/// </summary>
/// <param name="result">The PanGu segmentation result.</param>
/// <param name="vWordNode">Node of the "V" character that introduces the version.</param>
/// <param name="lastNode">Node of the last word belonging to the version number.</param>
/// <param name="versionBeginPosition">Start position (in the text) of the first word of the version number.</param>
private void Pickup(SuperLinkedList<WordInfo> result, SuperLinkedListNode<WordInfo> vWordNode, SuperLinkedListNode<WordInfo> lastNode, int versionBeginPosition)
{
    // Both values are read before any node is unlinked.
    int versionEnd = lastNode.Value.Position + lastNode.Value.Word.Length;
    SuperLinkedListNode<WordInfo> stop = lastNode.Next;

    // Keep deleting the node right behind "v" until the first node past the version is reached.
    for (SuperLinkedListNode<WordInfo> victim = vWordNode.Next; victim != stop; victim = vWordNode.Next)
    {
        result.Remove(victim);
    }

    // An upper-case "V" marker is stored in lower case.
    if (vWordNode.Value.Word == "V")
    {
        vWordNode.Value.Word = "v";
    }

    string version = _Text.Substring(versionBeginPosition, versionEnd - versionBeginPosition);

    int dotsSeen = 0;
    for (int dot = version.IndexOf('.', 0); dot > 0; dot = version.IndexOf('.', dot + 1))
    {
        // The part in front of the first dot is not extracted on its own.
        if (dotsSeen > 0)
        {
            // Extract the prefix made of the first n sub-versions.
            WordInfo prefixWord = new WordInfo(version.Substring(0, dot), POS.POS_D_K, 0);
            prefixWord.Rank = 1; // weight of a sub-version
            prefixWord.Position = versionBeginPosition;
            prefixWord.WordType = WordType.None;
            result.AddAfter(vWordNode, prefixWord);
        }

        dotsSeen++;
    }

    // Extract the complete version number.
    WordInfo fullWord = new WordInfo(version, POS.POS_D_K, 0);
    fullWord.Rank = 5; // weight of the complete version number
    fullWord.Position = versionBeginPosition;
    fullWord.WordType = WordType.None;
    result.AddAfter(vWordNode, fullWord);
}
/// <summary>
/// Extracts a version number: the nodes between the "v" marker and the last
/// version word are dropped from the list and replaced with version entries
/// (dotted prefixes and the complete version), each inserted directly after the marker.
/// </summary>
/// <param name="result">The PanGu segmentation result.</param>
/// <param name="vWordNode">Node of the "V" character that starts the version.</param>
/// <param name="lastNode">Node holding the last word of the version number.</param>
/// <param name="versionBeginPosition">Start position of the first word of the version number.</param>
private void Pickup(SuperLinkedList<WordInfo> result, SuperLinkedListNode<WordInfo> vWordNode, SuperLinkedListNode<WordInfo> lastNode, int versionBeginPosition)
{
    // Captured up front, before lastNode is unlinked from the list.
    int endOfVersion = lastNode.Value.Position + lastNode.Value.Word.Length;
    SuperLinkedListNode<WordInfo> firstKept = lastNode.Next;

    // Unlink everything that sits between the marker and the first kept node.
    SuperLinkedListNode<WordInfo> pending = vWordNode.Next;
    while (pending != firstKept)
    {
        result.Remove(pending);
        pending = vWordNode.Next;
    }

    // Normalise the marker to lower case.
    if (vWordNode.Value.Word == "V")
    {
        vWordNode.Value.Word = "v";
    }

    int versionLength = endOfVersion - versionBeginPosition;
    string version = _Text.Substring(versionBeginPosition, versionLength);

    int dotIndex = version.IndexOf('.', 0);
    int dotOrdinal = 0;
    while (dotIndex > 0)
    {
        // Nothing is emitted for the first dot: the leading component alone is not extracted.
        bool emitPrefix = dotOrdinal > 0;
        dotOrdinal++;

        if (emitPrefix)
        {
            // Emit the text that precedes this dot (the first n sub-versions).
            WordInfo partial = new WordInfo(version.Substring(0, dotIndex), POS.POS_D_K, 0);
            partial.Rank = 1; // weight of a sub-version
            partial.Position = versionBeginPosition;
            partial.WordType = WordType.None;
            result.AddAfter(vWordNode, partial);
        }

        dotIndex = version.IndexOf('.', dotIndex + 1);
    }

    // Finally emit the complete version number.
    WordInfo complete = new WordInfo(version, POS.POS_D_K, 0);
    complete.Rank = 5; // weight of the complete version number
    complete.Position = versionBeginPosition;
    complete.WordType = WordType.None;
    result.AddAfter(vWordNode, complete);
}
/// <summary>
/// Builds the initial segmentation of <paramref name="text"/> and hands it,
/// together with its first node, to <c>ActionSegment</c> for processing.
/// </summary>
/// <param name="text">Text to segment.</param>
/// <returns>The list produced by <c>GetInitSegment</c> after <c>ActionSegment</c> has run on it.</returns>
private SuperLinkedList<WordInfo> PreSegment(String text)
{
    SuperLinkedList<WordInfo> segments = GetInitSegment(text);

    // Processing starts at the head of the list.
    this.ActionSegment(segments, segments.First, text);

    return segments;
}
/// <summary>
/// Post-segmentation rule: for every English word that contains "Nokia"
/// (ignoring case) but is not exactly "Nokia", inserts an additional "Nokia"
/// entry right after it, positioned at the offset where the match was found.
/// </summary>
/// <param name="result">Segmentation result, modified in place; a null list is ignored.</param>
public void AfterSegment(SuperLinkedList<WordInfo> result)
{
    if (result == null)
    {
        return;
    }

    const string brand = "Nokia";

    SuperLinkedListNode<WordInfo> node = result.First;

    while (node != null)
    {
        WordInfo current = node.Value;

        if (current.WordType == WordType.English)
        {
            // Ordinal comparison: the match must not depend on the thread culture
            // (e.g. Turkish casing of 'i'/'I'), and the returned index is used as a
            // character offset, which only an ordinal search guarantees.
            int position = current.Word.IndexOf(brand, 0, StringComparison.OrdinalIgnoreCase);

            if (position >= 0 && !current.Word.Equals(brand, StringComparison.OrdinalIgnoreCase))
            {
                WordInfo wordinfo = new WordInfo(brand, current.Position + position,
                    current.Pos, current.Frequency, current.Rank,
                    current.WordType, current.OriginalWordType);

                // Continue from the inserted node so it is not examined again.
                node = result.AddAfter(node, wordinfo);
            }
        }

        node = node.Next;
    }
}
/// <summary>
/// Builds the word list for <paramref name="orginalText"/> from the dictionary
/// matches in <paramref name="positionLenArr"/>, then merges in the unknown words
/// returned by GetUnknowWords.
/// </summary>
/// <param name="positionLenArr">Dictionary matches (position/length entries) for the text.</param>
/// <param name="orginalText">The text being segmented.</param>
/// <param name="count">Number of entries of <paramref name="positionLenArr"/> to use; 0 means no dictionary match.</param>
/// <returns>The resulting list of words.</returns>
public SuperLinkedList <WordInfo> Match(PanGu.Dict.PositionLength[] positionLenArr, string orginalText, int count)
{
    // Fall back to the global configuration when no options/parameters were assigned.
    if (_Options == null)
    {
        _Options = Setting.PanGuSettings.Config.MatchOptions;
    }
    if (_Parameters == null)
    {
        _Parameters = Setting.PanGuSettings.Config.Parameters;
    }
    // One mask slot per character of the text; filled in below and passed to GetUnknowWords.
    int[] masks = new int[orginalText.Length];
    int redundancy = _Parameters.Redundancy;
    SuperLinkedList <WordInfo> result = new SuperLinkedList <WordInfo>();
    if (count == 0)
    {
        // No dictionary match at all.
        if (_Options.UnknownWordIdentify)
        {
            // Return the whole text as a single word.
            WordInfo wi = new WordInfo();
            wi.Word = orginalText;
            wi.Position = 0;
            wi.WordType = WordType.None;
            wi.Rank = 1;
            result.AddFirst(wi);
            return(result);
        }
        else
        {
            // Return one word per character.
            int position = 0;
            foreach (char c in orginalText)
            {
                WordInfo wi = new WordInfo();
                wi.Word = c.ToString();
                wi.Position = position++;
                wi.WordType = WordType.None;
                wi.Rank = 1;
                result.AddLast(wi);
            }
            return(result);
        }
    }
    Node[] leafNodeArray = GetLeafNodeArray(positionLenArr, orginalText.Length, count);
    // The two statements below are the old algorithm, which does not use the
    // isolated-point splitting algorithm.
    //Node[] leafNodeArray = GetLeafNodeArrayCore(positionLenArr, orginalText.Length, count);
    //Framework.QuickSort<Node>.TopSort(leafNodeArray,
    //    _LeafNodeList.Count, (int)Math.Min(TopRecord, _LeafNodeList.Count), new NodeComparer());
    int j = 0;
    // Take the first TopRecord word sequences.
    foreach (Node node in leafNodeArray)
    {
        // j tracks the index of the current element; a null entry ends the scan.
        if (leafNodeArray[j] == null)
        {
            break;
        }
        if (j >= TopRecord || j >= leafNodeArray.Length)
        {
            break;
        }
        // Walk from the leaf up through Parent links, filling the combination back to front.
        Dict.PositionLength[] comb = new PanGu.Dict.PositionLength[node.AboveCount];
        int i = node.AboveCount - 1;
        Node cur = node;
        while (i >= 0)
        {
            comb[i] = cur.PositionLength;
            cur = cur.Parent;
            i--;
        }
        _AllCombinations.Add(comb);
        j++;
    }
    // Force single word:
    // add one extra combination that consists of every single character.
    if (_Options.ForceSingleWord)
    {
        Dict.PositionLength[] comb = new PanGu.Dict.PositionLength[orginalText.Length];
        for (int i = 0; i < comb.Length; i++)
        {
            PanGu.Dict.PositionLength pl = new PanGu.Dict.PositionLength(i, 1, new WordAttribute(orginalText[i].ToString(), POS.POS_UNK, 0));
            pl.Level = 3;
            comb[i] = pl;
        }
        _AllCombinations.Add(comb);
    }
    if (_AllCombinations.Count > 0)
    {
        ICollection <Dict.PositionLength> positionCollection = MergeAllCombinations(redundancy);
        foreach (Dict.PositionLength pl in positionCollection)
        //for (int i = 0; i < _AllCombinations[0].Length; i++)
        {
            //result.AddLast(new WordInfo(_AllCombinations[0][i], orginalText));
            result.AddLast(new WordInfo(pl, orginalText, _Parameters));
            // Mark covered characters: 2 for every character of a multi-character word,
            // 1 for a single-character word.
            if (pl.Length > 1)
            {
                for (int k = pl.Position; k < pl.Position + pl.Length; k++)
                {
                    masks[k] = 2;
                }
            }
            else
            {
                masks[pl.Position] = 1;
            }
        }
    }
    #region 合并未登录词
    bool needRemoveSingleWord;
    List <WordInfo> unknownWords = GetUnknowWords(masks, orginalText, out needRemoveSingleWord);
    // Merge the unknown words into the matching positions of the result sequence.
    if (unknownWords.Count > 0)
    {
        SuperLinkedListNode <WordInfo> cur = result.First;
        if (needRemoveSingleWord && !_Options.ForceSingleWord)
        {
            // Remove the single-character words that need to be removed.
            // NOTE(review): mask value 11 is never written in this method; presumably
            // GetUnknowWords sets it on the masks array - confirm.
            while (cur != null)
            {
                if (cur.Value.Word.Length == 1)
                {
                    if (masks[cur.Value.Position] == 11)
                    {
                        SuperLinkedListNode <WordInfo> removeItem = cur;
                        cur = cur.Next;
                        result.Remove(removeItem);
                        continue;
                    }
                }
                cur = cur.Next;
            }
        }
        cur = result.First;
        j = 0;
        // Insert each unknown word before the first result node whose position is
        // not smaller than the unknown word's position.
        while (cur != null)
        {
            if (cur.Value.Position >= unknownWords[j].Position)
            {
                result.AddBefore(cur, unknownWords[j]);
                j++;
                if (j >= unknownWords.Count)
                {
                    break;
                }
            }
            // Advance only while the current node lies before the next unknown word;
            // otherwise the same node is compared against the next unknown word.
            if (cur.Value.Position < unknownWords[j].Position)
            {
                cur = cur.Next;
            }
        }
        // Unknown words located after the last result node are appended.
        while (j < unknownWords.Count)
        {
            result.AddLast(unknownWords[j]);
            j++;
        }
    }
    #endregion
    return(result);
}
/// <summary>
/// Post-processes a segmentation result in place: optionally appends synonyms
/// after each word and optionally inserts wildcard matches before each word.
/// </summary>
/// <param name="orginalText">The segmented text. Not used by the active code; it was only consumed by the user-defined custom-rule hook, which is disabled.</param>
/// <param name="result">Segmentation result to extend.</param>
private void ProcessAfterSegment(string orginalText, SuperLinkedList<WordInfo> result)
{
    // Synonym matching.
    if (_Options.SynonymOutput)
    {
        SuperLinkedListNode<WordInfo> node = result.First;

        while (node != null)
        {
            List<string> synonyms = _Synonym.GetSynonyms(node.Value.Word);

            if (synonyms != null)
            {
                // Capture the source word once. 'node' is advanced to each inserted
                // synonym, so reading node.Value inside the loop would make the second
                // and later synonyms report WordType.Synonym as their original type.
                WordInfo source = node.Value;

                foreach (string word in synonyms)
                {
                    // NOTE(review): synonyms are ranked with SymbolRank - confirm that a
                    // dedicated synonym rank was not intended.
                    node = result.AddAfter(node, new WordInfo(word, source.Position,
                        source.Pos, source.Frequency, _Parameters.SymbolRank,
                        WordType.Synonym, source.WordType));
                }
            }

            // Moves past the last inserted synonym, so synonyms are not expanded again.
            node = node.Next;
        }
    }

    // Wildcard matching.
    if (_Options.WildcardOutput)
    {
        SuperLinkedListNode<WordInfo> node = result.First;

        while (node != null)
        {
            List<Dict.Wildcard.WildcardInfo> wildcards = _Wildcard.GetWildcards(node.Value.Word);

            if (wildcards != null && wildcards.Count > 0)
            {
                for (int i = 0; i < wildcards.Count; i++)
                {
                    Dict.Wildcard.WildcardInfo wildcardInfo = wildcards[i];

                    int count = wildcardInfo.Segments.Count;

                    // Without wildcard segmentation only the first segment is output.
                    // The "count > 1" test keeps an empty segment list from being indexed.
                    if (!_Options.WildcardSegment && count > 1)
                    {
                        count = 1;
                    }

                    for (int j = 0; j < count; j++)
                    {
                        WordInfo wi = wildcardInfo.Segments[j];

                        if (wi.Word == node.Value.Word)
                        {
                            continue;
                        }

                        // NOTE(review): this mutates the WordInfo returned by GetWildcards;
                        // presumably those are fresh objects per call - confirm they are not cached.
                        wi.Rank = _Parameters.WildcardRank;
                        wi.Position += node.Value.Position;
                        result.AddBefore(node, wi);
                    }
                }
            }

            node = node.Next;

            if (node != null)
            {
                // Avoid duplicate output for multi-dimensional English segmentation:
                // skip a node whose word equals its predecessor's, ignoring case.
                if (node.Previous.Value.Word.ToLower() == node.Value.Word.ToLower())
                {
                    node = node.Next;
                }
            }
        }
    }

    // The user-defined custom-rule hook (which received orginalText) is disabled.
}
/// <summary>
/// Tries to merge <paramref name="current"/> and the run of Symbol/English nodes
/// that follows it into one dictionary word.
/// </summary>
/// <param name="orginalText">Text the node positions refer to.</param>
/// <param name="wordInfoList">List that is modified when the merge succeeds.</param>
/// <param name="current">
/// Node where the merge starts. On success it is moved to the first node after the
/// merged run (null when the run reached the end of the list); otherwise it is unchanged.
/// </param>
/// <returns>True when the merged text was found in the dictionary and the nodes were replaced.</returns>
private bool MergeEnglishSpecialWord(string orginalText, SuperLinkedList<WordInfo> wordInfoList, ref SuperLinkedListNode<WordInfo> current)
{
    // Find the end of the run of symbol / English nodes behind the start node.
    int runEnd = -1;
    SuperLinkedListNode<WordInfo> scan = current.Next;

    while (scan != null
        && (scan.Value.WordType == WordType.Symbol || scan.Value.WordType == WordType.English))
    {
        runEnd = scan.Value.Position + scan.Value.Word.Length;
        scan = scan.Next;
    }

    if (runEnd < 0)
    {
        // Nothing follows that could be merged.
        return false;
    }

    int runStart = current.Value.Position;
    int runLength = runEnd - runStart;

    WordAttribute attribute = _WordDictionary.GetWordAttr(orginalText.Substring(runStart, runLength));

    if (attribute == null)
    {
        // The combined text is not a dictionary word; leave the list untouched.
        return false;
    }

    // Drop the start node and the whole run; 'current' ends up on the node after the run.
    while (current != scan)
    {
        SuperLinkedListNode<WordInfo> doomed = current;
        current = current.Next;
        wordInfoList.Remove(doomed);
    }

    WordInfo merged = new WordInfo(new Dict.PositionLength(runStart, runLength, attribute), orginalText, _Parameters);
    merged.WordType = WordType.English;
    merged.Rank = _Parameters.EnglishRank;

    if (_Options.EnglishSegment)
    {
        string lowered = merged.Word.ToLower();

        if (lowered != merged.Word)
        {
            // The word is not all lower case: output the original spelling as well.
            if (current != null)
            {
                wordInfoList.AddBefore(current, merged);
            }
            else
            {
                wordInfoList.AddLast(merged);
            }
        }

        // The lower-case form is output with its own rank.
        merged = new WordInfo(lowered, merged.Position, merged.Pos, merged.Frequency,
            _Parameters.EnglishLowerRank, merged.WordType, merged.OriginalWordType);
    }
    else if (_Options.IgnoreCapital)
    {
        merged.Word = merged.Word.ToLower();
    }

    if (current != null)
    {
        wordInfoList.AddBefore(current, merged);
    }
    else
    {
        wordInfoList.AddLast(merged);
    }

    return true;
}
/// <summary>
/// Performs the first segmentation pass over <paramref name="text"/>: starts from the
/// list returned by GetInitSegment and rewrites it in place depending on each node's
/// word type (Chinese, English, numeric, symbol, space).
/// </summary>
/// <param name="text">Text to segment.</param>
/// <returns>The processed list of words.</returns>
private SuperLinkedList <WordInfo> PreSegment(String text)
{
    // NOTE(review): the catch block at the end only rethrows, so this try/catch has no effect.
    try
    {
        SuperLinkedList <WordInfo> result = GetInitSegment(text);
        SuperLinkedListNode <WordInfo> cur = result.First;
        while (cur != null)
        {
            // Drop space nodes when spaces are ignored.
            if (_Options.IgnoreSpace)
            {
                if (cur.Value.WordType == WordType.Space)
                {
                    SuperLinkedListNode <WordInfo> lst = cur;
                    cur = cur.Next;
                    result.Remove(lst);
                    continue;
                }
            }
            // Every case below is responsible for advancing cur itself.
            switch (cur.Value.WordType)
            {
            case WordType.SimplifiedChinese:
                string inputText = cur.Value.Word;
                WordType originalWordType = WordType.SimplifiedChinese;
                //_Options.TraditionalChineseEnabled = true;
                if (_Options.TraditionalChineseEnabled)
                {
                    // If converting to simplified changes the word, it is treated as
                    // traditional Chinese and the dictionary lookup uses the simplified form.
                    string simplified = WordDictionary.ToSimlifiedChinese(cur.Value.Word);
                    if (simplified != cur.Value.Word)
                    {
                        originalWordType = WordType.TraditionalChinese;
                        inputText = simplified;
                    }
                }
                Framework.AppendList <Dict.PositionLength> pls = _WordDictionary.GetAllMatchs(inputText, _Options.ChineseNameIdentify);
                Match.ChsFullTextMatch chsMatch = new Match.ChsFullTextMatch(_WordDictionary);
                chsMatch.Options = _Options;
                chsMatch.Parameters = _Parameters;
                // The matches were computed on inputText, but the words are taken from the
                // node's original word.
                SuperLinkedList <WordInfo> chsMatchWords = chsMatch.Match(pls.Items, cur.Value.Word, pls.Count);
                SuperLinkedListNode <WordInfo> curChsMatch = chsMatchWords.First;
                while (curChsMatch != null)
                {
                    WordInfo wi = curChsMatch.Value;
                    // Positions returned by the matcher are relative to the node's word;
                    // shift them by the node's position.
                    wi.Position += cur.Value.Position;
                    wi.OriginalWordType = originalWordType;
                    wi.WordType = originalWordType;
                    if (_Options.OutputSimplifiedTraditional)
                    {
                        if (_Options.TraditionalChineseEnabled)
                        {
                            // Also output the word in the other script (simplified <-> traditional)
                            // when the conversion actually changes it.
                            string newWord;
                            WordType wt;
                            if (originalWordType == WordType.SimplifiedChinese)
                            {
                                newWord = WordDictionary.ToTraditionalChinese(wi.Word);
                                wt = WordType.TraditionalChinese;
                            }
                            else
                            {
                                newWord = WordDictionary.ToSimlifiedChinese(wi.Word);
                                wt = WordType.SimplifiedChinese;
                            }
                            if (newWord != wi.Word)
                            {
                                WordInfo newWordInfo = new WordInfo(wi);
                                newWordInfo.Word = newWord;
                                newWordInfo.OriginalWordType = originalWordType;
                                newWordInfo.WordType = wt;
                                newWordInfo.Rank = _Parameters.SimplifiedTraditionalRank;
                                newWordInfo.Position = wi.Position;
                                chsMatchWords.AddBefore(curChsMatch, newWordInfo);
                            }
                        }
                    }
                    curChsMatch = curChsMatch.Next;
                }
                // Replace the Chinese node by the matched words and continue after them.
                SuperLinkedListNode <WordInfo> lst = result.AddAfter(cur, chsMatchWords);
                SuperLinkedListNode <WordInfo> removeItem = cur;
                cur = lst.Next;
                result.Remove(removeItem);
                break;
            case WordType.English:
                cur.Value.Rank = _Parameters.EnglishRank;
                List <string> output;
                cur.Value.Word = ConvertChineseCapitalToAsiic(cur.Value.Word);
                if (_Options.EnglishSegment)
                {
                    // Output the lower-case form in front of the word when it differs.
                    string lower = cur.Value.Word.ToLower();
                    if (lower != cur.Value.Word)
                    {
                        result.AddBefore(cur, new WordInfo(lower, cur.Value.Position, POS.POS_A_NX, 1, _Parameters.EnglishLowerRank, WordType.English, WordType.English));
                    }
                    // Output the stem in front of the word when it differs from the lower-case form.
                    string stem = GetStem(lower);
                    if (!string.IsNullOrEmpty(stem))
                    {
                        if (lower != stem)
                        {
                            result.AddBefore(cur, new WordInfo(stem, cur.Value.Position, POS.POS_A_NX, 1, _Parameters.EnglishStemRank, WordType.English, WordType.English));
                        }
                    }
                }
                else if (_Options.IgnoreCapital)
                {
                    cur.Value.Word = cur.Value.Word.ToLower();
                }
                if (_Options.EnglishMultiDimensionality)
                {
                    // Only words containing a digit or an underscore are candidates for splitting.
                    bool needSplit = false;
                    foreach (char c in cur.Value.Word)
                    {
                        if ((c >= '0' && c <= '9') || (c == '_'))
                        {
                            needSplit = true;
                            break;
                        }
                    }
                    if (needSplit)
                    {
                        if (Framework.Regex.GetMatchStrings(cur.Value.Word, PATTERNS, true, out output))
                        {
                            // Split only when there are at least two non-empty pieces.
                            int outputCount = 0;
                            foreach (string str in output)
                            {
                                if (!string.IsNullOrEmpty(str))
                                {
                                    outputCount++;
                                    if (outputCount > 1)
                                    {
                                        break;
                                    }
                                }
                            }
                            if (outputCount > 1)
                            {
                                // Each non-empty piece is inserted before the word; its position is
                                // the word's position plus the lengths of the preceding pieces.
                                int position = cur.Value.Position;
                                foreach (string splitWord in output)
                                {
                                    if (string.IsNullOrEmpty(splitWord))
                                    {
                                        continue;
                                    }
                                    WordInfo wi;
                                    // A piece starting with a digit is numeric, anything else English.
                                    if (splitWord[0] >= '0' && splitWord[0] <= '9')
                                    {
                                        wi = new WordInfo(splitWord, POS.POS_A_M, 1);
                                        wi.Position = position;
                                        wi.Rank = _Parameters.NumericRank;
                                        wi.OriginalWordType = WordType.English;
                                        wi.WordType = WordType.Numeric;
                                    }
                                    else
                                    {
                                        wi = new WordInfo(splitWord, POS.POS_A_NX, 1);
                                        wi.Position = position;
                                        wi.Rank = _Parameters.EnglishRank;
                                        wi.OriginalWordType = WordType.English;
                                        wi.WordType = WordType.English;
                                    }
                                    result.AddBefore(cur, wi);
                                    position += splitWord.Length;
                                }
                            }
                        }
                    }
                }
                // cur is passed by reference; advance manually only when no merge took place.
                if (!MergeEnglishSpecialWord(text, result, ref cur))
                {
                    cur = cur.Next;
                }
                break;
            case WordType.Numeric:
                cur.Value.Word = ConvertChineseCapitalToAsiic(cur.Value.Word);
                cur.Value.Rank = _Parameters.NumericRank;
                // cur is passed by reference; advance manually only when no merge took place.
                if (!MergeEnglishSpecialWord(text, result, ref cur))
                {
                    cur = cur.Next;
                }
                //cur = cur.Next;
                break;
            case WordType.Symbol:
                cur.Value.Rank = _Parameters.SymbolRank;
                cur = cur.Next;
                break;
            default:
                cur = cur.Next;
                break;
            }
        }
        return(result);
    }
    catch (Exception)
    {
        throw;
    }
}
/// <summary>
/// Merges English special words.
/// If the dictionary contains special English words such as U.S.A, C++ or C#,
/// the English words and symbols produced by the initial segmentation have to be
/// merged back into that single word.
/// </summary>
/// <param name="orginalText">Text the node positions refer to.</param>
/// <param name="wordInfoList">List that is modified when the merge succeeds.</param>
/// <param name="current">
/// Node where the merge starts. On success it is moved to the first node after the
/// merged run (null when the run reached the end of the list); otherwise it is unchanged.
/// </param>
/// <returns>True when the merged text is a dictionary word and the nodes were replaced by it.</returns>
private bool MergeEnglishSpecialWord(string orginalText, SuperLinkedList<WordInfo> wordInfoList, ref SuperLinkedListNode<WordInfo> current)
{
    // Walk over the symbol / English nodes that directly follow the start node.
    int mergedEnd = -1;
    SuperLinkedListNode<WordInfo> scan = current.Next;

    while (scan != null
        && (scan.Value.WordType == WordType.Symbol || scan.Value.WordType == WordType.English))
    {
        mergedEnd = scan.Value.Position + scan.Value.Word.Length;
        scan = scan.Next;
    }

    if (mergedEnd < 0)
    {
        // No mergeable node follows.
        return false;
    }

    int mergedStart = current.Value.Position;
    int mergedLength = mergedEnd - mergedStart;
    string candidate = orginalText.Substring(mergedStart, mergedLength);

    WordAttribute attribute = _WordDictionary.GetWordAttr(candidate);

    if (attribute == null)
    {
        // Not a dictionary word: nothing is changed.
        return false;
    }

    // Remove the start node and the scanned run; 'current' stops on the node after the run.
    while (current != scan)
    {
        SuperLinkedListNode<WordInfo> doomed = current;
        current = current.Next;
        wordInfoList.Remove(doomed);
    }

    WordInfo merged = new WordInfo(new PanGu.Dict.PositionLength(mergedStart, mergedLength, attribute), orginalText, _Parameters);
    merged.WordType = WordType.English;
    merged.Rank = _Parameters.EnglishRank;

    // Put the merged word where the removed nodes were.
    if (current != null)
    {
        wordInfoList.AddBefore(current, merged);
    }
    else
    {
        wordInfoList.AddLast(merged);
    }

    return true;
}
/// <summary>
/// Post-segmentation rule that recognises version numbers: a single-letter English
/// word "v"/"V" followed by numeric words and "." words, each at most one character
/// away from the previous one. Every recognised version is handed to <c>Pickup</c>.
/// </summary>
/// <param name="result">Segmentation result, modified in place; a null list is ignored.</param>
public void AfterSegment(SuperLinkedList<WordInfo> result)
{
    if (result == null)
    {
        return;
    }

    SuperLinkedListNode<WordInfo> node = result.First;
    SuperLinkedListNode<WordInfo> vWordNode = null;   // candidate "v" marker
    SuperLinkedListNode<WordInfo> lastNode = null;    // last node accepted into the version (the marker itself at first)
    bool isVersion = false;                           // true once at least one version part followed the marker
    int versionBeginPosition = -1;                    // position of the first version part

    while (node != null)
    {
        if (vWordNode == null)
        {
            // Look for the character V, which starts a version number.
            if (node.Value.WordType == WordType.English
                && node.Value.Word.Length == 1
                && (node.Value.Word[0] == 'v' || node.Value.Word[0] == 'V'))
            {
                vWordNode = node;
                lastNode = node;
            }
        }
        else
        {
            // Multi-dimensional output of the V itself (same position): ignore, move on.
            if (node.Value.Position == vWordNode.Value.Position)
            {
                node = node.Next;
                continue;
            }

            // Distance between the end of the last accepted node and this node.
            int gap = node.Value.Position - (lastNode.Value.Position + lastNode.Value.Word.Length);

            // A number or a dot continues the version when it is (almost) adjacent.
            if ((node.Value.WordType == WordType.Numeric || node.Value.Word == ".") && gap <= 1)
            {
                if (versionBeginPosition < 0)
                {
                    versionBeginPosition = node.Value.Position;
                }

                isVersion = true;
                lastNode = node;
                node = node.Next;
                continue;
            }

            if (isVersion)
            {
                // The version ended in front of this node: extract it, then re-examine
                // the same node with a clean state (it may itself be a new "v").
                Pickup(result, vWordNode, lastNode, versionBeginPosition);
                vWordNode = null;
                lastNode = null;
                versionBeginPosition = -1;
                isVersion = false;
                continue;
            }

            if (gap > 1)
            {
                // The scan has moved more than one character past a "v" that was never
                // followed by a version part. Assuming node positions do not decrease,
                // no later node can attach to it, so forget the stale marker; otherwise
                // it would block every later version in the text (e.g. "v abc v1.2").
                // The same node is re-examined because it may be a new "v".
                vWordNode = null;
                lastNode = null;
                continue;
            }
        }

        node = node.Next;
    }

    if (isVersion)
    {
        // The list ended while inside a version number: extract it.
        Pickup(result, vWordNode, lastNode, versionBeginPosition);
    }
}