// Ambiguity resolution
        /// <summary>
        /// Resolves segmentation ambiguity for the word currently held in <c>buffer</c>.
        /// </summary>
        /// <remarks>
        /// The candidate word is taken to be the first <c>length - 1</c> characters of
        /// <c>buffer</c> (the caller in this file pushes one extra look-ahead character
        /// before calling). Two cases are handled:
        /// 1. the dictionary matcher reports a shorter word inside the candidate, in which
        ///    case the tail is re-matched as a word of its own;
        /// 2. the candidate is exactly two characters long, in which case a word starting
        ///    at its second character is looked for.
        /// On exit <c>length</c> is set to the chosen word length. Characters passed to
        /// <c>Prep</c> are presumably kept in <c>prepBuffer</c> / <c>prepLength</c> for the
        /// next token — NOTE(review): confirm against <c>Prep</c>, which is not visible here.
        /// </remarks>
        private void ClearDifferentMeanings()
        {
            int len = length - 1;

            // If the current word contains a shorter word, re-segment it;
            // otherwise continue ambiguity detection further on.
            if (assoStream.BackToLastWordEnd())
            {
                int start = assoStream.Step;
                int end   = start;
                assoStream.Reset();

                char c = buffer[end];
                // Keep matching a word over the characters left after the shorter word.
                while (end < length && assoStream.Associate(c = buffer[end]))
                {
                    Prep(c);
                    end++;
                }

                // If everything in the buffer has been matched, continue reading from the text stream.
                if (end == length)
                {
                    while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
                    {
                        Prep(c);
                    }
                }

                // If a word occurred among the pre-processed characters, backtrack to the last word end.
                if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                {
                    assoStream.BackToLastWordEnd();
                    bufferIndex = bufferIndex - prepLength + assoStream.Step;
                    offset      = offset - prepLength + assoStream.Step;
                    prepLength  = assoStream.Step;
                }

                // If a complete word exists and it is longer than the previous word in the buffer,
                // cut the buffer down to the shorter word and keep the remaining characters in the
                // pre-read buffer; otherwise roll back this attempt.
                if (assoStream.IsWordEnd && (prepLength > len || prepLength > start && len > start + 1 || prepLength >= start && Char.GetUnicodeCategory(c) != UnicodeCategory.OtherLetter))
                {
                    len = start;
                }
                else
                {
                    // The read position is only adjusted when the text-stream loop above was
                    // entered (end reached length).
                    if (end == length)
                    {
                        bufferIndex = bufferIndex + len - prepLength - start;
                        offset      = offset + len - prepLength - start;
                    }
                    prepLength = 0;
                }
            }
            else if (len == 2)
            {
                // Save the part of speech (not implemented).
                //WordPart wp1 = assoStream.GetWordPart();
                assoStream.Reset();
                char c = buffer[1];
                // Starting from the second character of the word, match characters one by one.
                if (assoStream.Associate(c) && assoStream.Associate(buffer[2]))
                {
                    Prep(c);
                    Prep(buffer[2]);
                    // Extend the overlapping candidate with further characters from the text stream.
                    while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
                    {
                        Prep(c);
                    }
                    // Backtrack to the last complete word if the match ran past it.
                    if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                    {
                        assoStream.BackToLastWordEnd();
                        bufferIndex = bufferIndex - prepLength + assoStream.Step;
                        offset      = offset - prepLength + assoStream.Step;
                        prepLength  = assoStream.Step;
                    }
                    if (assoStream.IsWordEnd)
                    {
                        if (prepLength == 2)
                        {
                            // Part-of-speech combination rule, not yet implemented:
                            //WordPart wp2 = assoStream.GetWordPart();
                            //if(WordPart.Combo(wp1, wp2)
                            //    len = 2;
                            //else
                            //    len = 1;

                            // Temporary approach: check whether the second pre-read character and the
                            // character just read start yet another word.
                            assoStream.Reset();
                            if (assoStream.Associate(prepBuffer[1]) && assoStream.Associate(c))
                            {
                                // Scratch buffer for the third candidate word.
                                char[] tmp = new char[MAX_WORD_LEN];
                                tmp[0] = prepBuffer[1];
                                tmp[1] = c;
                                int tmpLength = 2;
                                while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
                                {
                                    tmp[tmpLength++] = c;
                                }
                                // Backtrack to the last complete word if the match ran past it.
                                if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                                {
                                    assoStream.BackToLastWordEnd();
                                    bufferIndex = bufferIndex - tmpLength + assoStream.Step;
                                    offset      = offset - tmpLength + assoStream.Step;
                                    tmpLength   = assoStream.Step;
                                }
                                if (assoStream.IsWordEnd)
                                {
                                    // The third candidate is a word: it replaces the pre-read buffer content.
                                    tmp.CopyTo(prepBuffer, 0);
                                    prepLength = tmpLength;
                                }
                                else
                                {
                                    // No word found: rewind the read position and discard the pre-read characters.
                                    bufferIndex = bufferIndex - tmpLength;
                                    offset      = offset - tmpLength;
                                    prepLength  = 0;
                                }
                            }
                            else if (LinkRule.Test(buffer[0]))
                            {
                                // The first character passes LinkRule: emit it on its own.
                                len = 1;
                            }
                            else
                            {
                                // Keep the original two-character word and drop the pre-read characters.
                                bufferIndex = bufferIndex - prepLength + 1;
                                offset      = offset - prepLength + 1;
                                prepLength  = 0;
                            }
                        }
                        else
                        {
                            // The overlapping word is not exactly two characters long:
                            // emit only the first character.
                            len = 1;
                        }
                    }
                    else
                    {
                        // No overlapping word: rewind and discard the pre-read characters.
                        bufferIndex = bufferIndex - prepLength + 1;
                        offset      = offset - prepLength + 1;
                        prepLength  = 0;
                    }
                }
            }
            length = len;
        }
        /// <summary>
        /// Reads the next token from the input.
        /// </summary>
        /// <remarks>
        /// The first character decides how the token is read: a run of decimal digits
        /// (with single dots), a run of lowercase letters (plus '+' and '#'), or a run of
        /// <see cref="UnicodeCategory.OtherLetter"/> characters that is segmented with the
        /// dictionary matcher. Any other first character yields an empty string.
        /// </remarks>
        /// <returns>
        /// The result of <c>Flush()</c> for the token that was read, or
        /// <see cref="String.Empty"/> when the first character is in none of the handled categories.
        /// </returns>
        public string Next()
        {
            // Characters are still pending in the buffer from a previous call: emit them first.
            if (length > 0)
            {
                return(Flush());
            }
            // Reset the dictionary matcher.
            assoStream.Reset();

            // Read the next character.
            char c = GetNextChar();

            // If there is no more content in the buffer, end the current read.
            if (dataLen < bufferIndex)
            {
                return(Flush());
            }

            // Put the character into the output array.
            Push(c);

            // Choose the reading procedure from the category of the first character.
            switch (Char.GetUnicodeCategory(c))
            {
            // Digit: read all following digits until a non-digit is met.
            // A '.' is accepted as long as the previous buffered character is not also a '.'.
            case UnicodeCategory.DecimalDigitNumber:
                while (
                    (
                        Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.DecimalDigitNumber ||
                        c == '.' && buffer[length - 1] != '.'
                    ) &&
                    length < MAX_WORD_LEN
                    )
                {
                    Push(c);
                }
                // NOTE(review): Back() presumably un-reads the character that ended the loop — confirm.
                Back();
                return(Flush().Trim('.'));

            // English letter: read all following letters until a non-letter is met.
            // Only lowercase letters, '+' and '#' are accepted here.
            case UnicodeCategory.LowercaseLetter:
                while (
                    (
                        Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.LowercaseLetter ||
                        c == '+' ||
                        c == '#'
                    ) &&
                    length < MAX_WORD_LEN
                    )
                {
                    Push(c);
                }
                Back();
                return(Flush());

            // Chinese character: start the Chinese word segmentation procedure.
            case UnicodeCategory.OtherLetter:
                // Code points 19968..40869 (U+4E00..U+9FA5) or 12354..12435 (U+3042..U+3093).
                if (c > 19967 && c < 40870 || c > 12353 && c < 12436)
                {
                    assoStream.Associate(c);
                    // Read the next character and check whether it is also an OtherLetter character.
                    while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && length < MAX_WORD_LEN)
                    {
                        // Match words downwards through the dictionary tree until no further match is possible.
                        if (!assoStream.IsBegin && assoStream.HasChildren && assoStream.Associate(c))
                        {
                            Push(c);
                            continue;
                        }
                        // If a successfully matched word occurred earlier, go back to that last word.
                        if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                        {
                            assoStream.BackToLastWordEnd();
                            bufferIndex = bufferIndex - length + assoStream.Step;
                            offset      = offset - length + assoStream.Step;
                            length      = assoStream.Step;
                        }
                        // If the buffer holds exactly one complete word, stop and resolve ambiguity.
                        // The look-ahead character is pushed too, so the word is length - 1 long.
                        if (assoStream.IsWordEnd)
                        {
                            Push(c);
                            ClearDifferentMeanings();
                        }
                        // Otherwise try to match a person name.
                        else
                        {
                            if (!LinkRule.Test(buffer[0]))
                            {
                                Push(c);
                                StrangeWordMacth();
                            }
                            else
                            {
                                // The first character passes LinkRule: rewind and emit it on its own.
                                bufferIndex = bufferIndex - length;
                                offset      = offset - length;
                                length      = 1;
                                return(Flush());
                            }
                        }
                        // Every path that reaches this point ends the scan after one decision.
                        break;
                    }
                }
                else
                {
                    // Other OtherLetter characters: collect the whole run without dictionary matching.
                    while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && length < MAX_WORD_LEN)
                    {
                        Push(c);
                    }
                }
                Back();
                return(Flush());

            // Non-readable character (punctuation, whitespace, ...): skip it and go on to the next token.
            default:
                length = 0;
                return(String.Empty);
            }
        }
Exemple #3
0
        /// <summary>
        /// Runs unknown-word detection (person names, place names) on a character sequence.
        /// </summary>
        /// <remarks>
        /// <paramref name="word"/> must hold at least two valid characters: both
        /// <c>word[0]</c> and <c>word[1]</c> are read unconditionally.
        /// </remarks>
        /// <param name="word">Character sequence to inspect.</param>
        /// <param name="length">Number of valid characters in <paramref name="word"/>.</param>
        /// <returns>
        /// A character count derived from the rule that matched: <paramref name="length"/>
        /// when a surname is followed by at most two characters, the surname plus one or two
        /// given-name characters otherwise, the position reported by the dictionary or
        /// place-name scan when no surname is found, and 1 as the fallback.
        /// NOTE(review): presumably the length of the word the caller should emit — confirm
        /// against the caller, which is not visible here.
        /// </returns>
        public static int Match(Char[] word, int length)
        {
            int nStart = 0;

            // Surname detection: a two-character surname is tried first, then a single-character one.
            if (FamilyNameRule.Test(((int)word[0] << 2) + (int)word[1]))
            {
                nStart = 2;
            }
            else if (FamilyNameRule.Test((int)word[0]) && !LinkRule.Test(word[1]))
            {
                nStart = 1;
            }
            else
            {
                AssociateStream assoStream = new AssociateStream();

                // Link-character check: if the second character is not a link character,
                // scan forward for a pair of adjacent characters the dictionary can associate.
                if (!LinkRule.Test(word[1]))
                {
                    // Each iteration reads word[nStart] and word[nStart + 1], so the scan must stop
                    // one short of length; the previous bound (nStart < length) read word[length],
                    // which is past the valid characters.
                    for (nStart = 1; nStart < length - 1;)
                    {
                        assoStream.Associate(word[nStart++]);
                        if (!assoStream.IsBegin && assoStream.Associate(word[nStart]))
                        {
                            return nStart - 1;
                        }

                        assoStream.Reset();
                    }
                }

                // Place-name detection: a character that is not a link character, followed by
                // a character accepted by PlaceRule.
                for (nStart = 0; nStart < length - 1;)
                {
                    assoStream.Reset();
                    if (!LinkRule.Test(word[nStart++]) && PlaceRule.Test(word[nStart]))
                    {
                        return nStart + 1;
                    }
                }

                return 1;
            }

            // A surname was found and at most two characters follow it:
            // the whole sequence is treated as one complete name.
            if (length - nStart <= 2)
            {
                return length;
            }

            // From here on length - nStart >= 3, so word[nStart + 1] and word[nStart + 2] are valid.
            int nEnd = nStart + 1;
            AssociateStream nameStream = new AssociateStream();

            // If the next character is a link character, or the current and next characters
            // cannot be combined into a known word, the current character is taken as part of the name.
            if (LinkRule.Test(word[nEnd + 1]) || !(nameStream.Associate(word[nEnd]) && nameStream.Associate(word[nEnd + 1])))
            {
                nEnd++;
            }

            return nEnd;
        }