/// <summary>
/// Ambiguity resolution: re-checks the word currently held in <c>buffer</c> and settles on
/// how many of its characters to keep, writing that count back to <c>length</c>.
/// </summary>
/// <remarks>
/// The candidate length starts at <c>length - 1</c>. Characters that are split off the word
/// are handed to <c>Prep</c>, and <c>bufferIndex</c>/<c>offset</c> are rewound whenever
/// characters consumed through <c>GetNextChar</c> turn out not to be needed.
/// NOTE(review): <c>assoStream</c>, <c>Prep</c>, <c>GetNextChar</c> and <c>LinkRule</c> are
/// defined elsewhere; statements about them below are carried over from the original
/// author's comments and should be confirmed against those definitions.
/// </remarks>
private void ClearDifferentMeanings()
{
    int len = length - 1;
    // If the current word contains a shorter word, re-segment it; otherwise continue
    // with forward ambiguity detection (the "else if" branch below).
    if (assoStream.BackToLastWordEnd())
    {
        // Step is sampled right after backing up; presumably the length of the shorter word.
        int start = assoStream.Step;
        int end = start;
        assoStream.Reset();
        // Initial value of c; it is still used further down if the loop below never runs.
        char c = buffer[end];
        // Keep matching words over the characters left after the shorter word.
        while (end < length && assoStream.Associate(c = buffer[end]))
        {
            Prep(c);
            end++;
        }
        // Once everything in the buffer has been matched, keep reading from the text stream.
        // UnicodeCategory.OtherLetter is the category CJK ideographs belong to.
        if (end == length)
        {
            while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
            {
                Prep(c);
            }
        }
        // If a word occurred somewhere inside the pre-read characters, backtrack to the
        // last such word and rewind the read position by the characters given up.
        if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
        {
            assoStream.BackToLastWordEnd();
            bufferIndex = bufferIndex - prepLength + assoStream.Step;
            offset = offset - prepLength + assoStream.Step;
            prepLength = assoStream.Step;
        }
        // If a complete word exists and it is longer than the previous word in the buffer,
        // cut the buffer down to the shorter word and leave the extra characters in the
        // pre-read buffer; otherwise roll this attempt back.
        if (assoStream.IsWordEnd && (prepLength > len || prepLength > start && len > start + 1 || prepLength >= start && Char.GetUnicodeCategory(c) != UnicodeCategory.OtherLetter))
        {
            len = start;
        }
        else
        {
            // Only when the stream was read past the buffer (end == length) does the
            // read position need to be adjusted before discarding the pre-read characters.
            if (end == length)
            {
                bufferIndex = bufferIndex + len - prepLength - start;
                offset = offset + len - prepLength - start;
            }
            prepLength = 0;
        }
    }
    else if (len == 2)
    {
        // len == 2 means the buffered word has three characters (len is length - 1).
        // Save part of speech
        //WordPart wp1 = assoStream.GetWordPart();
        assoStream.Reset();
        char c = buffer[1];
        // Starting from the second character of the word, match successive characters.
        if (assoStream.Associate(c) && assoStream.Associate(buffer[2]))
        {
            Prep(c);
            Prep(buffer[2]);
            while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
            {
                Prep(c);
            }
            // Backtrack to the last complete word seen, rewinding the read position.
            if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
            {
                assoStream.BackToLastWordEnd();
                bufferIndex = bufferIndex - prepLength + assoStream.Step;
                offset = offset - prepLength + assoStream.Step;
                prepLength = assoStream.Step;
            }
            if (assoStream.IsWordEnd)
            {
                if (prepLength == 2)
                {
                    // Part-of-speech combination rule, not implemented yet:
                    //WordPart wp2 = assoStream.GetWordPart();
                    //if(WordPart.Combo(wp1, wp2)
                    //  len = 2;
                    //else
                    //  len = 1;
                    // Temporary workaround: try to match a word that starts at the second
                    // pre-read character and continues with c.
                    // NOTE(review): c is the last character returned by GetNextChar above;
                    // confirm it is still the character following prepBuffer[1] when the
                    // backtracking step above has rewound the read position.
                    assoStream.Reset();
                    if (assoStream.Associate(prepBuffer[1]) && assoStream.Associate(c))
                    {
                        // NOTE(review): writes to tmp are not bounds-checked here; this
                        // relies on Associate failing before MAX_WORD_LEN characters — confirm.
                        char[] tmp = new char[MAX_WORD_LEN];
                        tmp[0] = prepBuffer[1];
                        tmp[1] = c;
                        int tmpLength = 2;
                        while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
                        {
                            tmp[tmpLength++] = c;
                        }
                        // Same backtracking as above, but tracked in tmpLength.
                        if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                        {
                            assoStream.BackToLastWordEnd();
                            bufferIndex = bufferIndex - tmpLength + assoStream.Step;
                            offset = offset - tmpLength + assoStream.Step;
                            tmpLength = assoStream.Step;
                        }
                        if (assoStream.IsWordEnd)
                        {
                            // The later word wins: it replaces the pre-read content and
                            // len stays at 2.
                            tmp.CopyTo(prepBuffer, 0);
                            prepLength = tmpLength;
                        }
                        else
                        {
                            // No word found: rewind by the tmp characters and drop the pre-read content.
                            bufferIndex = bufferIndex - tmpLength;
                            offset = offset - tmpLength;
                            prepLength = 0;
                        }
                    }
                    else if (LinkRule.Test(buffer[0]))
                    {
                        // The first character passes LinkRule: keep it on its own.
                        len = 1;
                    }
                    else
                    {
                        // Roll back: rewind the read position and drop the pre-read content.
                        bufferIndex = bufferIndex - prepLength + 1;
                        offset = offset - prepLength + 1;
                        prepLength = 0;
                    }
                }
                else
                {
                    // A word other than two characters long starts at the second character:
                    // keep only the first character in the buffer.
                    len = 1;
                }
            }
            else
            {
                // No word starts at the second character: roll back and drop the pre-read content.
                bufferIndex = bufferIndex - prepLength + 1;
                offset = offset - prepLength + 1;
                prepLength = 0;
            }
        }
    }
    // Commit the settled length.
    length = len;
}
/// <summary>
/// Performs new-word (out-of-vocabulary) detection on a character sequence, checking for
/// a family name first and then for known words and place-name characters.
/// </summary>
/// <param name="word">The character sequence; only the first <paramref name="length"/> characters are valid.</param>
/// <param name="length">Number of valid characters in <paramref name="word"/>; expected to be at least 1.</param>
/// <returns>
/// The number of leading characters of <paramref name="word"/> that form one unit
/// (presumably the length of the detected new word — confirm against callers).
/// </returns>
public static int Match(Char[] word, int length)
{
    // A single character cannot be split further. Every path below yields 1 for this
    // input anyway; returning early avoids reading word[1], which is past the valid range.
    if (length == 1)
    {
        return 1;
    }

    int nStart;

    // Family-name detection: two-character surname first, then single-character surname.
    // NOTE(review): the two-character key shifts word[0] by only 2 bits before adding
    // word[1]; confirm this matches how FamilyNameRule builds its keys.
    if (FamilyNameRule.Test(((int)word[0] << 2) + (int)word[1]))
    {
        nStart = 2;
    }
    else if (FamilyNameRule.Test((int)word[0]) && !LinkRule.Test(word[1]))
    {
        nStart = 1;
    }
    else
    {
        AssociateStream assoStream = new AssociateStream();

        // Link-character check: if the second character is not a link character, scan
        // forward for the first pair of adjacent characters that begins a known word.
        if (!LinkRule.Test(word[1]))
        {
            // The bound is length - 1 so that word[i + 1] stays inside the valid range
            // (the original loop ran to length and read word[length]).
            for (int i = 1; i < length - 1; i++)
            {
                assoStream.Associate(word[i]);
                if (!assoStream.IsBegin && assoStream.Associate(word[i + 1]))
                {
                    // Everything before the known word is the new word.
                    return i;
                }

                assoStream.Reset();
            }
        }

        // Place-name detection: a non-link character followed by a place-name character.
        for (int i = 0; i < length - 1; i++)
        {
            if (!LinkRule.Test(word[i]) && PlaceRule.Test(word[i + 1]))
            {
                return i + 2;
            }
        }

        return 1;
    }

    // A family name was found and fewer than three characters follow it:
    // the whole sequence is one complete personal name.
    if (length - nStart <= 2)
    {
        return length;
    }

    // At least three characters follow the family name here, so word[nEnd + 1] is valid.
    int nEnd = nStart + 1;
    AssociateStream nameStream = new AssociateStream();

    // If the next character is a link character, or the current and next characters do
    // not combine into a known word, the current character belongs to the given name.
    if (LinkRule.Test(word[nEnd + 1])
        || !(nameStream.Associate(word[nEnd]) && nameStream.Associate(word[nEnd + 1])))
    {
        nEnd++;
    }

    return nEnd;
}