/// <summary>
/// Scans <paramref name="text"/> position by position and collects every
/// dictionary hit (and, optionally, every Chinese-name hit) that starts there.
/// </summary>
/// <param name="text">Text to scan. A null or empty string yields an empty result.</param>
/// <param name="chineseNameIdentify">
/// When true, <c>ChineseName.Match</c> is consulted at each position and each
/// returned name is added to the result tagged <c>POS.POS_A_NR</c>.
/// </param>
/// <returns>The list of matches that were added while scanning.</returns>
public Framework.AppendList<PositionLength> GetAllMatchs(string text, bool chineseNameIdentify)
{
    Framework.AppendList<PositionLength> result = new PanGu.Framework.AppendList<PositionLength>();

    // The previous guard was "text == null && text == \"\"", which can never be
    // true, so null/empty input crashed on text[0] below.
    if (string.IsNullOrEmpty(text))
    {
        return result;
    }

    // Lookups are done on a lower-cased copy only when the first character is ASCII.
    // NOTE(review): ToLower() is culture-sensitive; left unchanged because how the
    // dictionary keys were normalised is not visible from this method.
    string keyText = text;
    if (text[0] < 128)
    {
        keyText = keyText.ToLower();
    }

    for (int i = 0; i < text.Length; i++)
    {
        byte[] lenList;
        char fst = keyText[i];

        // Chinese-name detection runs on the original text, not on keyText.
        List<string> chsNames = null;
        if (chineseNameIdentify)
        {
            chsNames = ChineseName.Match(text, i);
            if (chsNames != null)
            {
                foreach (string name in chsNames)
                {
                    WordAttribute nameAttr = new WordAttribute(name, POS.POS_A_NR, 0);
                    result.Add(new PositionLength(i, name.Length, nameAttr));
                }
            }
        }

        // One-character entry.
        WordAttribute fwa;
        if (_FirstCharDict.TryGetValue(fst, out fwa))
        {
            result.Add(new PositionLength(i, 1, fwa));
        }

        // Two-character entry: both chars packed into one uint, first char in the high 16 bits.
        if (i < keyText.Length - 1)
        {
            uint doubleChar = ((uint)keyText[i] * 65536) + keyText[i + 1];
            if (_DoubleCharDict.TryGetValue(doubleChar, out fwa))
            {
                result.Add(new PositionLength(i, 2, fwa));
            }
        }

        // Fewer than three characters remain: no triple-char key can be built.
        if (i >= keyText.Length - 2)
        {
            continue;
        }

        // Three chars packed into one long. The middle term is computed in uint so
        // that chars >= 0x8000 cannot overflow int arithmetic in a checked build;
        // the resulting key value is the same as before.
        long tripleChar = ((long)keyText[i]) * 0x100000000
                          + ((uint)keyText[i + 1] * 65536)
                          + keyText[i + 2];

        if (!_TripleCharDict.TryGetValue(tripleChar, out lenList))
        {
            continue;
        }

        // lenList holds candidate word lengths for this prefix; a zero entry ends the list.
        foreach (byte len in lenList)
        {
            if (len == 0)
            {
                break;
            }

            if (i + len > keyText.Length)
            {
                continue;
            }

            string key = keyText.Substring(i, len);
            WordAttribute wa;
            if (!_WordDict.TryGetValue(key, out wa))
            {
                continue;
            }

            // Skip a dictionary word that was already added above as a Chinese name
            // at this position.
            if (chsNames != null && chsNames.Contains(wa.Word))
            {
                continue;
            }

            result.Add(new PositionLength(i, len, wa));
        }
    }

    return result;
}
/// <summary>
/// Scans <paramref name="text"/> position by position and collects every
/// dictionary hit (and, optionally, every Chinese-name hit) that starts there.
/// </summary>
/// <param name="text">Text to scan. A null or empty string yields an empty result.</param>
/// <param name="chineseNameIdentify">
/// When true, <c>ChineseName.Match</c> is consulted at each position and each
/// returned name is added to the result tagged <c>POS.POS_A_NR</c>.
/// </param>
/// <returns>The list of matches that were added while scanning.</returns>
public Framework.AppendList<PositionLength> GetAllMatchs(string text, bool chineseNameIdentify)
{
    Framework.AppendList<PositionLength> result = new PanGu.Framework.AppendList<PositionLength>();

    // The previous guard was "text == null && text == \"\"", which can never be
    // true, so null/empty input crashed on text[0] below.
    if (string.IsNullOrEmpty(text))
    {
        return result;
    }

    // Lookups are done on a lower-cased copy only when the first character is ASCII.
    // NOTE(review): ToLower() is culture-sensitive; left unchanged because how the
    // dictionary keys were normalised is not visible from this method.
    string keyText = text;
    if (text[0] < 128)
    {
        keyText = keyText.ToLower();
    }

    for (int i = 0; i < text.Length; i++)
    {
        byte[] lenList;
        char fst = keyText[i];

        // Chinese-name detection runs on the original text, not on keyText.
        List<string> chsNames = null;
        if (chineseNameIdentify)
        {
            chsNames = ChineseName.Match(text, i);
            if (chsNames != null)
            {
                foreach (string name in chsNames)
                {
                    WordAttribute nameAttr = new WordAttribute(name, POS.POS_A_NR, 0);
                    result.Add(new PositionLength(i, name.Length, nameAttr));
                }
            }
        }

        // One-character entry.
        WordAttribute fwa;
        if (_FirstCharDict.TryGetValue(fst, out fwa))
        {
            result.Add(new PositionLength(i, 1, fwa));
        }

        // Two-character entry: both chars packed into one uint, first char in the high 16 bits.
        if (i < keyText.Length - 1)
        {
            uint doubleChar = ((uint)keyText[i] * 65536) + keyText[i + 1];
            if (_DoubleCharDict.TryGetValue(doubleChar, out fwa))
            {
                result.Add(new PositionLength(i, 2, fwa));
            }
        }

        // Fewer than three characters remain: no triple-char key can be built.
        if (i >= keyText.Length - 2)
        {
            continue;
        }

        // Three chars packed into one long. The middle term is computed in uint so
        // that chars >= 0x8000 cannot overflow int arithmetic in a checked build;
        // the resulting key value is the same as before.
        long tripleChar = ((long)keyText[i]) * 0x100000000
                          + ((uint)keyText[i + 1] * 65536)
                          + keyText[i + 2];

        if (!_TripleCharDict.TryGetValue(tripleChar, out lenList))
        {
            continue;
        }

        // lenList holds candidate word lengths for this prefix; a zero entry ends the list.
        foreach (byte len in lenList)
        {
            if (len == 0)
            {
                break;
            }

            if (i + len > keyText.Length)
            {
                continue;
            }

            string key = keyText.Substring(i, len);
            WordAttribute wa;
            if (!_WordDict.TryGetValue(key, out wa))
            {
                continue;
            }

            // Skip a dictionary word that was already added above as a Chinese name
            // at this position.
            if (chsNames != null && chsNames.Contains(wa.Word))
            {
                continue;
            }

            result.Add(new PositionLength(i, len, wa));
        }
    }

    return result;
}
/// <summary>
/// Builds the initial segmentation of <paramref name="text"/> with
/// <c>GetInitSegment</c> and then walks the resulting list node by node,
/// rewriting each node according to its <c>WordType</c>:
/// Chinese runs are replaced by their dictionary matches, English and numeric
/// words are normalised, ranked and optionally expanded, symbols are ranked.
/// </summary>
/// <param name="text">The text being segmented; also handed to <c>MergeEnglishSpecialWord</c>.</param>
/// <returns>The list returned by <c>GetInitSegment</c>, modified in place.</returns>
private SuperLinkedList<WordInfo> PreSegment(String text)
{
    SuperLinkedList<WordInfo> result = GetInitSegment(text);
    SuperLinkedListNode<WordInfo> cur = result.First;

    while (cur != null)
    {
        // Drop whitespace nodes when configured to do so.
        if (_Options.IgnoreSpace && cur.Value.WordType == WordType.Space)
        {
            SuperLinkedListNode<WordInfo> spaceNode = cur;
            cur = cur.Next;
            result.Remove(spaceNode);
            continue;
        }

        switch (cur.Value.WordType)
        {
            case WordType.SimplifiedChinese:
            {
                string inputText = cur.Value.Word;
                WordType originalWordType = WordType.SimplifiedChinese;

                // If converting to Simplified changes the word, the input was
                // Traditional; look it up in its Simplified form.
                if (_Options.TraditionalChineseEnabled)
                {
                    string simplified = Microsoft.VisualBasic.Strings.StrConv(
                        cur.Value.Word, Microsoft.VisualBasic.VbStrConv.SimplifiedChinese, 0);
                    if (simplified != cur.Value.Word)
                    {
                        originalWordType = WordType.TraditionalChinese;
                        inputText = simplified;
                    }
                }

                PanGu.Framework.AppendList<Dict.PositionLength> pls =
                    _WordDictionary.GetAllMatchs(inputText, _Options.ChineseNameIdentify);

                PanGu.Match.ChsFullTextMatch chsMatch = new PanGu.Match.ChsFullTextMatch(_WordDictionary);
                chsMatch.Options = _Options;
                chsMatch.Parameters = _Parameters;

                // Matches were found on inputText, but the matcher is given the
                // original word from the node.
                SuperLinkedList<WordInfo> chsMatchWords =
                    chsMatch.Match(pls.Items, cur.Value.Word, pls.Count);

                SuperLinkedListNode<WordInfo> curChsMatch = chsMatchWords.First;
                while (curChsMatch != null)
                {
                    WordInfo matched = curChsMatch.Value;

                    // Shift the match position by the position of the run being replaced.
                    matched.Position += cur.Value.Position;
                    matched.OriginalWordType = originalWordType;
                    matched.WordType = originalWordType;

                    if (_Options.OutputSimplifiedTraditional && _Options.TraditionalChineseEnabled)
                    {
                        // Also emit the word in the other script, when it differs.
                        string newWord;
                        WordType wt;
                        if (originalWordType == WordType.SimplifiedChinese)
                        {
                            newWord = Microsoft.VisualBasic.Strings.StrConv(
                                matched.Word, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0);
                            wt = WordType.TraditionalChinese;
                        }
                        else
                        {
                            newWord = Microsoft.VisualBasic.Strings.StrConv(
                                matched.Word, Microsoft.VisualBasic.VbStrConv.SimplifiedChinese, 0);
                            wt = WordType.SimplifiedChinese;
                        }

                        if (newWord != matched.Word)
                        {
                            WordInfo newWordInfo = new WordInfo(matched);
                            newWordInfo.Word = newWord;
                            newWordInfo.OriginalWordType = originalWordType;
                            newWordInfo.WordType = wt;
                            newWordInfo.Rank = _Parameters.SimplifiedTraditionalRank;
                            newWordInfo.Position = matched.Position;
                            chsMatchWords.AddBefore(curChsMatch, newWordInfo);
                        }
                    }

                    curChsMatch = curChsMatch.Next;
                }

                // Splice the matches in after the current node, then remove the node.
                // NOTE(review): AddAfter presumably returns the last spliced node, so
                // scanning resumes right after the inserted words - confirm.
                SuperLinkedListNode<WordInfo> inserted = result.AddAfter(cur, chsMatchWords);
                SuperLinkedListNode<WordInfo> removeItem = cur;
                cur = inserted.Next;
                result.Remove(removeItem);
                break;
            }

            case WordType.English:
            {
                cur.Value.Rank = _Parameters.EnglishRank;
                List<string> output;
                cur.Value.Word = ConvertChineseCapitalToAsiic(cur.Value.Word);

                if (_Options.EnglishSegment)
                {
                    // Add a lower-cased variant and a stemmed variant in front of
                    // the word when they differ from it.
                    string lower = cur.Value.Word.ToLower();
                    if (lower != cur.Value.Word)
                    {
                        result.AddBefore(cur, new WordInfo(lower, cur.Value.Position, POS.POS_A_NX, 1,
                            _Parameters.EnglishLowerRank, WordType.English, WordType.English));
                    }

                    string stem = GetStem(lower);
                    if (!string.IsNullOrEmpty(stem) && lower != stem)
                    {
                        result.AddBefore(cur, new WordInfo(stem, cur.Value.Position, POS.POS_A_NX, 1,
                            _Parameters.EnglishStemRank, WordType.English, WordType.English));
                    }
                }
                else if (_Options.IgnoreCapital)
                {
                    cur.Value.Word = cur.Value.Word.ToLower();
                }

                if (_Options.EnglishMultiDimensionality)
                {
                    // Only words containing a digit or '_' are candidates for splitting.
                    bool needSplit = false;
                    foreach (char c in cur.Value.Word)
                    {
                        if ((c >= '0' && c <= '9') || (c == '_'))
                        {
                            needSplit = true;
                            break;
                        }
                    }

                    if (needSplit
                        && Framework.Regex.GetMatchStrings(cur.Value.Word, PATTERNS, true, out output))
                    {
                        // Splitting is only worthwhile when at least two non-empty parts exist.
                        int outputCount = 0;
                        foreach (string str in output)
                        {
                            if (!string.IsNullOrEmpty(str))
                            {
                                outputCount++;
                                if (outputCount > 1)
                                {
                                    break;
                                }
                            }
                        }

                        if (outputCount > 1)
                        {
                            int position = cur.Value.Position;
                            foreach (string splitWord in output)
                            {
                                if (string.IsNullOrEmpty(splitWord))
                                {
                                    continue;
                                }

                                // Parts starting with a digit are numeric, the rest English.
                                WordInfo part;
                                if (splitWord[0] >= '0' && splitWord[0] <= '9')
                                {
                                    part = new WordInfo(splitWord, POS.POS_A_M, 1);
                                    part.Position = position;
                                    part.Rank = _Parameters.NumericRank;
                                    part.OriginalWordType = WordType.English;
                                    part.WordType = WordType.Numeric;
                                }
                                else
                                {
                                    part = new WordInfo(splitWord, POS.POS_A_NX, 1);
                                    part.Position = position;
                                    part.Rank = _Parameters.EnglishRank;
                                    part.OriginalWordType = WordType.English;
                                    part.WordType = WordType.English;
                                }

                                result.AddBefore(cur, part);
                                position += splitWord.Length;
                            }
                        }
                    }
                }

                // NOTE(review): MergeEnglishSpecialWord takes cur by ref and presumably
                // repositions it when it returns true - confirm. Advance only otherwise.
                if (!MergeEnglishSpecialWord(text, result, ref cur))
                {
                    cur = cur.Next;
                }
                break;
            }

            case WordType.Numeric:
            {
                cur.Value.Word = ConvertChineseCapitalToAsiic(cur.Value.Word);
                cur.Value.Rank = _Parameters.NumericRank;

                if (!MergeEnglishSpecialWord(text, result, ref cur))
                {
                    cur = cur.Next;
                }
                break;
            }

            case WordType.Symbol:
            {
                cur.Value.Rank = _Parameters.SymbolRank;
                cur = cur.Next;
                break;
            }

            default:
            {
                cur = cur.Next;
                break;
            }
        }
    }

    return result;
}