コード例 #1
0
        /// <summary>
        /// Disambiguation pass over the characters currently held in <c>buffer</c>.
        /// Re-runs dictionary matching from a later position inside the buffered word and,
        /// depending on what is found, shortens <c>length</c> and leaves the surplus
        /// characters in the pre-read state (<c>prepLength</c>), or rolls the read position
        /// (<c>bufferIndex</c>, <c>offset</c>) back.
        /// </summary>
        /// <remarks>
        /// NOTE(review): <c>assoStream</c>, <c>Prep</c>, <c>GetNextChar</c>, <c>LinkRule</c>,
        /// <c>prepBuffer</c> and <c>MAX_WORD_LEN</c> are declared outside this block; their exact
        /// semantics are assumed from usage here and should be confirmed against their definitions.
        /// </remarks>
        private void ClearDifferentMeanings()
        {
            // Candidate result length; written back to `length` at the end of the method.
            // NOTE(review): presumably the last buffered character is a look-ahead that is
            // not part of the current word, hence the "- 1" -- confirm against the caller.
            int len = length - 1;

            // If the current word contains a shorter word, re-segment it;
            // otherwise continue ambiguity detection further along the text.
            if (assoStream.BackToLastWordEnd())
            {
                // Step of the stream after backtracking; used below as the index in
                // `buffer` from which matching restarts.
                int start = assoStream.Step;
                int end   = start;
                assoStream.Reset();

                // NOTE(review): buffer[end] is read here before the `end < length` check in
                // the loop below -- confirm that `start` is always a valid index into buffer.
                char c = buffer[end];
                // Keep matching the surplus characters against the dictionary.
                while (end < length && assoStream.Associate(c = buffer[end]))
                {
                    Prep(c);
                    end++;
                }

                // If everything in the buffer was matched, keep reading from the text stream
                // for as long as the characters are OtherLetter and still extend a match.
                if (end == length)
                {
                    while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
                    {
                        Prep(c);
                    }
                }

                // If the pre-read characters passed through a complete word but do not end on
                // one, backtrack to the last word end and shrink the pre-read state to match.
                if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                {
                    assoStream.BackToLastWordEnd();
                    bufferIndex = bufferIndex - prepLength + assoStream.Step;
                    offset      = offset - prepLength + assoStream.Step;
                    prepLength  = assoStream.Step;
                }

                // If a complete word exists and it is longer than the previous word in the
                // buffer, cut the buffer at the word boundary and keep the surplus characters
                // in the pre-read buffer; otherwise roll back this attempt.
                if (assoStream.IsWordEnd && (prepLength > len || prepLength > start && len > start + 1 || prepLength >= start && Char.GetUnicodeCategory(c) != UnicodeCategory.OtherLetter))
                {
                    len = start;
                }
                else
                {
                    // The read position is only adjusted when characters were consumed
                    // beyond the buffer (end == length).
                    if (end == length)
                    {
                        bufferIndex = bufferIndex + len - prepLength - start;
                        offset      = offset + len - prepLength - start;
                    }
                    // Discard the pre-read characters.
                    prepLength = 0;
                }
            }
            else if (len == 2)
            {
                // Save the part of speech (disabled).
                //WordPart wp1 = assoStream.GetWordPart();
                assoStream.Reset();
                char c = buffer[1];
                // Starting from the second character of the word, match character by character.
                if (assoStream.Associate(c) && assoStream.Associate(buffer[2]))
                {
                    Prep(c);
                    Prep(buffer[2]);
                    // Extend the match with further OtherLetter characters from the stream.
                    while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
                    {
                        Prep(c);
                    }
                    // Same backtracking rule as above: fall back to the last complete word.
                    if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                    {
                        assoStream.BackToLastWordEnd();
                        bufferIndex = bufferIndex - prepLength + assoStream.Step;
                        offset      = offset - prepLength + assoStream.Step;
                        prepLength  = assoStream.Step;
                    }
                    if (assoStream.IsWordEnd)
                    {
                        if (prepLength == 2)
                        {
                            // Part-of-speech combination rule, not implemented yet.
                            //WordPart wp2 = assoStream.GetWordPart();
                            //if(WordPart.Combo(wp1, wp2)
                            //    len = 2;
                            //else
                            //    len = 1;

                            // Temporary workaround: try to build a word starting at the second
                            // pre-read character followed by the character read last.
                            assoStream.Reset();
                            if (assoStream.Associate(prepBuffer[1]) && assoStream.Associate(c))
                            {
                                // Scratch buffer for the alternative candidate word.
                                // NOTE(review): the loop below does not bound tmpLength by
                                // MAX_WORD_LEN -- presumably dictionary words never exceed it; confirm.
                                char[] tmp = new char[MAX_WORD_LEN];
                                tmp[0] = prepBuffer[1];
                                tmp[1] = c;
                                int tmpLength = 2;
                                while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
                                {
                                    tmp[tmpLength++] = c;
                                }
                                // Backtrack the candidate to its last complete word, if any.
                                if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                                {
                                    assoStream.BackToLastWordEnd();
                                    bufferIndex = bufferIndex - tmpLength + assoStream.Step;
                                    offset      = offset - tmpLength + assoStream.Step;
                                    tmpLength   = assoStream.Step;
                                }
                                if (assoStream.IsWordEnd)
                                {
                                    // The candidate is a word: it replaces the pre-read content.
                                    tmp.CopyTo(prepBuffer, 0);
                                    prepLength = tmpLength;
                                }
                                else
                                {
                                    // No word found: rewind the read position and drop the pre-read data.
                                    bufferIndex = bufferIndex - tmpLength;
                                    offset      = offset - tmpLength;
                                    prepLength  = 0;
                                }
                            }
                            else if (LinkRule.Test(buffer[0]))
                            {
                                // The first buffered character passes the link rule: keep it alone.
                                len = 1;
                            }
                            else
                            {
                                // Roll back: rewind the read position and drop the pre-read data.
                                bufferIndex = bufferIndex - prepLength + 1;
                                offset      = offset - prepLength + 1;
                                prepLength  = 0;
                            }
                        }
                        else
                        {
                            // A word with a pre-read length other than two was found from the
                            // second character on: keep only the first character in the buffer.
                            len = 1;
                        }
                    }
                    else
                    {
                        // No complete word from the second character on: roll back.
                        bufferIndex = bufferIndex - prepLength + 1;
                        offset      = offset - prepLength + 1;
                        prepLength  = 0;
                    }
                }
            }
            // Publish the (possibly shortened) length.
            length = len;
        }
コード例 #2
0
ファイル: StrangeWord.cs プロジェクト: jjwangnlp/cnopenblog
        /// <summary>
        /// Runs unknown-word detection on a character sequence: a family-name check first,
        /// then (when no family name is found) dictionary matching and a place-name rule.
        /// </summary>
        /// <param name="word">The character sequence to examine.</param>
        /// <param name="length">Number of valid characters in <paramref name="word"/>.</param>
        /// <returns>
        /// A character count measured from the start of <paramref name="word"/>.
        /// Returns 1 when no rule applies or when fewer than two valid characters are supplied,
        /// and <paramref name="length"/> when a family name is followed by at most two characters.
        /// </returns>
        public static int Match(Char[] word, int length)
        {
            // Every rule below inspects word[1]. With fewer than two valid characters there is
            // nothing to analyse, so return the same default as the fall-through path instead
            // of reading past the valid data.
            if (length < 2)
            {
                return 1;
            }

            int nStart;

            // Family-name detection: two-character family name first, then single-character.
            if (FamilyNameRule.Test(((int)word[0] << 2) + (int)word[1]))
            {
                nStart = 2;
            }
            else if (FamilyNameRule.Test((int)word[0]) && !LinkRule.Test(word[1]))
            {
                nStart = 1;
            }
            else
            {
                // Link-character check: if the second character is not a linking character,
                // look for a dictionary match starting further along the sequence.
                if (!LinkRule.Test(word[1]))
                {
                    AssociateStream assoStream = new AssociateStream();
                    for (nStart = 1; nStart < length;)
                    {
                        assoStream.Associate(word[nStart++]);
                        // nStart was just advanced, so it must be re-checked against length
                        // before indexing; otherwise the last iteration reads word[length].
                        if (nStart < length && !assoStream.IsBegin && assoStream.Associate(word[nStart]))
                        {
                            return nStart - 1;
                        }
                        assoStream.Reset();
                    }
                }

                // Place-name detection: a non-linking character followed by a character
                // accepted by the place rule ends the match just after that pair.
                for (int i = 0; i < length - 1; i++)
                {
                    if (!LinkRule.Test(word[i]) && PlaceRule.Test(word[i + 1]))
                    {
                        return i + 2;
                    }
                }
                return 1;
            }

            // A family name was detected and fewer than three characters remain:
            // the whole sequence is one complete personal name.
            if (length - nStart <= 2)
            {
                return length;
            }

            // At least three characters follow the family name here, so word[nEnd + 1]
            // below is always inside the valid range.
            int nEnd = nStart + 1;
            AssociateStream nameStream = new AssociateStream();

            // If the next character is a linking character, or the current and next characters
            // cannot be combined into a known word, the current character is confirmed as
            // part of the given name.
            if (LinkRule.Test(word[nEnd + 1]) || !(nameStream.Associate(word[nEnd]) && nameStream.Associate(word[nEnd + 1])))
            {
                nEnd++;
            }
            return nEnd;
        }