Пример #1
0
        private SuperLinkedList <WordInfo> PreSegment(String text)
        {
            SuperLinkedList <WordInfo> result = GetInitSegment(text);

            SuperLinkedListNode <WordInfo> cur = result.First;

            while (cur != null)
            {
                if (_Options.IgnoreSpace)
                {
                    if (cur.Value.WordType == WordType.Space)
                    {
                        SuperLinkedListNode <WordInfo> lst = cur;
                        cur = cur.Next;
                        result.Remove(lst);
                        continue;
                    }
                }

                switch (cur.Value.WordType)
                {
                case WordType.SimplifiedChinese:

                    string inputText = cur.Value.Word;

                    WordType originalWordType = WordType.SimplifiedChinese;

                    if (_Options.TraditionalChineseEnabled)
                    {
                        string simplified = Microsoft.VisualBasic.Strings.StrConv(cur.Value.Word, Microsoft.VisualBasic.VbStrConv.SimplifiedChinese, 0);

                        if (simplified != cur.Value.Word)
                        {
                            originalWordType = WordType.TraditionalChinese;
                            inputText        = simplified;
                        }
                    }

                    微博舆论.Framework.AppendList <Dict.PositionLength> pls = _WordDictionary.GetAllMatchs(inputText, _Options.ChineseNameIdentify);
                    微博舆论.Match.ChsFullTextMatch chsMatch = new 微博舆论.Match.ChsFullTextMatch(_WordDictionary);
                    chsMatch.Options    = _Options;
                    chsMatch.Parameters = _Parameters;
                    SuperLinkedList <WordInfo> chsMatchWords = chsMatch.Match(pls.Items, cur.Value.Word, pls.Count);

                    SuperLinkedListNode <WordInfo> curChsMatch = chsMatchWords.First;
                    while (curChsMatch != null)
                    {
                        WordInfo wi = curChsMatch.Value;

                        wi.Position        += cur.Value.Position;
                        wi.OriginalWordType = originalWordType;
                        wi.WordType         = originalWordType;

                        if (_Options.OutputSimplifiedTraditional)
                        {
                            if (_Options.TraditionalChineseEnabled)
                            {
                                string   newWord;
                                WordType wt;

                                if (originalWordType == WordType.SimplifiedChinese)
                                {
                                    newWord = Microsoft.VisualBasic.Strings.StrConv(wi.Word,
                                                                                    Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0);
                                    wt = WordType.TraditionalChinese;
                                }
                                else
                                {
                                    newWord = Microsoft.VisualBasic.Strings.StrConv(wi.Word,
                                                                                    Microsoft.VisualBasic.VbStrConv.SimplifiedChinese, 0);
                                    wt = WordType.SimplifiedChinese;
                                }

                                if (newWord != wi.Word)
                                {
                                    WordInfo newWordInfo = new WordInfo(wi);
                                    newWordInfo.Word             = newWord;
                                    newWordInfo.OriginalWordType = originalWordType;
                                    newWordInfo.WordType         = wt;
                                    newWordInfo.Rank             = _Parameters.SimplifiedTraditionalRank;
                                    newWordInfo.Position         = wi.Position;
                                    chsMatchWords.AddBefore(curChsMatch, newWordInfo);
                                }
                            }
                        }

                        curChsMatch = curChsMatch.Next;
                    }

                    SuperLinkedListNode <WordInfo> lst        = result.AddAfter(cur, chsMatchWords);
                    SuperLinkedListNode <WordInfo> removeItem = cur;
                    cur = lst.Next;
                    result.Remove(removeItem);
                    break;

                case WordType.English:
                    cur.Value.Rank = _Parameters.EnglishRank;
                    List <string> output;
                    cur.Value.Word = ConvertChineseCapitalToAsiic(cur.Value.Word);

                    if (_Options.EnglishSegment)
                    {
                        string lower = cur.Value.Word.ToLower();

                        if (lower != cur.Value.Word)
                        {
                            result.AddBefore(cur, new WordInfo(lower, cur.Value.Position, POS.POS_A_NX, 1,
                                                               _Parameters.EnglishLowerRank, WordType.English, WordType.English));
                        }

                        string stem = GetStem(lower);

                        if (!string.IsNullOrEmpty(stem))
                        {
                            if (lower != stem)
                            {
                                result.AddBefore(cur, new WordInfo(stem, cur.Value.Position, POS.POS_A_NX, 1,
                                                                   _Parameters.EnglishStemRank, WordType.English, WordType.English));
                            }
                        }
                    }
                    else if (_Options.IgnoreCapital)
                    {
                        cur.Value.Word = cur.Value.Word.ToLower();
                    }

                    if (_Options.EnglishMultiDimensionality)
                    {
                        bool needSplit = false;

                        foreach (char c in cur.Value.Word)
                        {
                            if ((c >= '0' && c <= '9') || (c == '_'))
                            {
                                needSplit = true;
                                break;
                            }
                        }

                        if (needSplit)
                        {
                            if (Framework.Regex.GetMatchStrings(cur.Value.Word, PATTERNS, true, out output))
                            {
                                int outputCount = 0;

                                foreach (string str in output)
                                {
                                    if (!string.IsNullOrEmpty(str))
                                    {
                                        outputCount++;

                                        if (outputCount > 1)
                                        {
                                            break;
                                        }
                                    }
                                }


                                if (outputCount > 1)
                                {
                                    int position = cur.Value.Position;

                                    foreach (string splitWord in output)
                                    {
                                        if (string.IsNullOrEmpty(splitWord))
                                        {
                                            continue;
                                        }

                                        WordInfo wi;

                                        if (splitWord[0] >= '0' && splitWord[0] <= '9')
                                        {
                                            wi                  = new WordInfo(splitWord, POS.POS_A_M, 1);
                                            wi.Position         = position;
                                            wi.Rank             = _Parameters.NumericRank;
                                            wi.OriginalWordType = WordType.English;
                                            wi.WordType         = WordType.Numeric;
                                        }
                                        else
                                        {
                                            wi                  = new WordInfo(splitWord, POS.POS_A_NX, 1);
                                            wi.Position         = position;
                                            wi.Rank             = _Parameters.EnglishRank;
                                            wi.OriginalWordType = WordType.English;
                                            wi.WordType         = WordType.English;
                                        }

                                        result.AddBefore(cur, wi);
                                        position += splitWord.Length;
                                    }
                                }
                            }
                        }
                    }

                    if (!MergeEnglishSpecialWord(text, result, ref cur))
                    {
                        cur = cur.Next;
                    }

                    break;

                case WordType.Numeric:
                    cur.Value.Word = ConvertChineseCapitalToAsiic(cur.Value.Word);
                    cur.Value.Rank = _Parameters.NumericRank;

                    if (!MergeEnglishSpecialWord(text, result, ref cur))
                    {
                        cur = cur.Next;
                    }

                    //cur = cur.Next;
                    break;

                case WordType.Symbol:
                    cur.Value.Rank = _Parameters.SymbolRank;
                    cur            = cur.Next;
                    break;

                default:
                    cur = cur.Next;
                    break;
                }
            }


            return(result);
        }
Пример #2
0
        private void ProcessAfterSegment(string orginalText, SuperLinkedList <WordInfo> result)
        {
            //匹配同义词
            if (_Options.SynonymOutput)
            {
                SuperLinkedListNode <WordInfo> node = result.First;

                while (node != null)
                {
                    List <string> synonyms = _Synonym.GetSynonyms(node.Value.Word);

                    if (synonyms != null)
                    {
                        foreach (string word in synonyms)
                        {
                            node = result.AddAfter(node, new WordInfo(word, node.Value.Position,
                                                                      node.Value.Pos, node.Value.Frequency, _Parameters.SymbolRank,
                                                                      WordType.Synonym, node.Value.WordType));
                        }
                    }

                    node = node.Next;
                }
            }

            //通配符匹配
            if (_Options.WildcardOutput)
            {
                SuperLinkedListNode <WordInfo> node = result.First;

                while (node != null)
                {
                    List <Dict.Wildcard.WildcardInfo> wildcards =
                        _Wildcard.GetWildcards(node.Value.Word);

                    if (wildcards.Count > 0)
                    {
                        for (int i = 0; i < wildcards.Count; i++)
                        {
                            Dict.Wildcard.WildcardInfo wildcardInfo = wildcards[i];

                            int count = wildcardInfo.Segments.Count;
                            if (!_Options.WildcardSegment)
                            {
                                count = 1;
                            }

                            for (int j = 0; j < count; j++)
                            {
                                WordInfo wi = wildcardInfo.Segments[j];

                                if (wi.Word == node.Value.Word)
                                {
                                    continue;
                                }

                                wi.Rank      = _Parameters.WildcardRank;
                                wi.Position += node.Value.Position;
                                result.AddBefore(node, wi);
                            }
                        }
                    }

                    node = node.Next;

                    if (node != null)
                    {
                        //过滤英文分词时多元分词重复输出的问题
                        if (node.Previous.Value.Word.ToLower() == node.Value.Word.ToLower())
                        {
                            node = node.Next;
                        }
                    }
                }
            }

            //用户自定义规则
            if (_Options.CustomRule)
            {
                ICustomRule rule = CustomRule.GetCustomRule(_Parameters.CustomRuleAssemblyFileName,
                                                            _Parameters.CustomRuleFullClassName);

                if (rule != null)
                {
                    rule.Text = orginalText;
                    rule.AfterSegment(result);
                }
            }
        }
Пример #3
0
        /// <summary>
        /// 合并英文专用词。
        /// 如果字典中有英文专用词如U.S.A, C++.C#等
        /// 需要对初步分词后的英文和字母进行合并
        /// </summary>
        /// <param name="words"></param>
        /// <param name="start"></param>
        /// <param name="end"></param>
        /// <returns></returns>
        //private String MergeEnglishSpecialWord(CExtractWords extractWords, ArrayList words, int start, ref int end)
        //{
        //    StringBuilder str = new StringBuilder();

        //    int i;

        //    for (i = start; i < words.Count; i++)
        //    {
        //        string word = (string)words[i];

        //        //word 为空或者为空格回车换行等分割符号,中断扫描
        //        if (word.Trim() == "")
        //        {
        //            break;
        //        }

        //        //如果遇到中文,中断扫描
        //        if (word[0] >= 0x4e00 && word[0] <= 0x9fa5)
        //        {
        //            break;
        //        }

        //        str.Append(word);
        //    }

        //    String mergeString = str.ToString();
        //    List<T_WordInfo> exWords = extractWords.ExtractFullText(mergeString);

        //    if (exWords.Count == 1)
        //    {
        //        T_WordInfo info = (T_WordInfo)exWords[0];
        //        if (info.Word.Length == mergeString.Length)
        //        {
        //            end = i;
        //            return mergeString;
        //        }
        //    }

        //    return null;

        //}

        private bool MergeEnglishSpecialWord(string orginalText, SuperLinkedList <WordInfo> wordInfoList, ref SuperLinkedListNode <WordInfo> current)
        {
            SuperLinkedListNode <WordInfo> cur = current;

            cur = cur.Next;

            int last = -1;

            while (cur != null)
            {
                if (cur.Value.WordType == WordType.Symbol || cur.Value.WordType == WordType.English)
                {
                    last = cur.Value.Position + cur.Value.Word.Length;
                    cur  = cur.Next;
                }
                else
                {
                    break;
                }
            }


            if (last >= 0)
            {
                int first = current.Value.Position;

                string newWord = orginalText.Substring(first, last - first);

                WordAttribute wa = _WordDictionary.GetWordAttr(newWord);

                if (wa == null)
                {
                    return(false);
                }

                while (current != cur)
                {
                    SuperLinkedListNode <WordInfo> removeItem = current;
                    current = current.Next;
                    wordInfoList.Remove(removeItem);
                }

                WordInfo newWordInfo = new WordInfo(new 微博舆论.Dict.PositionLength(first, last - first,
                                                                                 wa), orginalText, _Parameters);

                newWordInfo.WordType = WordType.English;
                newWordInfo.Rank     = _Parameters.EnglishRank;

                if (_Options.EnglishSegment)
                {
                    string lowerWord = newWordInfo.Word.ToLower();

                    if (lowerWord != newWordInfo.Word)
                    {
                        if (current == null)
                        {
                            wordInfoList.AddLast(newWordInfo);
                        }
                        else
                        {
                            wordInfoList.AddBefore(current, newWordInfo);
                        }
                    }

                    newWordInfo = new WordInfo(lowerWord, newWordInfo.Position, newWordInfo.Pos, newWordInfo.Frequency, _Parameters.EnglishLowerRank, newWordInfo.WordType,
                                               newWordInfo.OriginalWordType);
                }
                else if (_Options.IgnoreCapital)
                {
                    newWordInfo.Word = newWordInfo.Word.ToLower();
                }

                if (current == null)
                {
                    wordInfoList.AddLast(newWordInfo);
                }
                else
                {
                    wordInfoList.AddBefore(current, newWordInfo);
                }

                return(true);
            }


            return(false);
        }