/// <summary>
/// Ambiguity resolution for the word currently held in <c>buffer</c>.
/// Works on <c>len = length - 1</c> characters (the last pushed character is treated as
/// look-ahead), tries alternative segmentations that start inside the current word, and
/// finally writes the chosen word length back to <c>length</c>. Characters belonging to a
/// following word are staged through <c>Prep</c> / <c>prepBuffer</c> / <c>prepLength</c>.
/// </summary>
private void ClearDifferentMeanings()
{
    int len = length - 1;

    // If the current word contains a shorter word, re-segment from the end of that shorter
    // word; otherwise continue with forward ambiguity detection (the "len == 2" branch).
    if (assoStream.BackToLastWordEnd())
    {
        int start = assoStream.Step;
        int end = start;
        assoStream.Reset();
        char c = buffer[end];

        // Continue dictionary matching over the characters left after the shorter word.
        while (end < length && assoStream.Associate(c = buffer[end]))
        {
            Prep(c);
            end++;
        }

        // If everything in the buffer has been matched, keep reading from the text stream.
        if (end == length)
        {
            while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
            {
                Prep(c);
            }
        }

        // If a word occurred somewhere inside the pre-processed characters, roll back to the
        // last such word and rewind the read position by the number of surplus characters.
        if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
        {
            assoStream.BackToLastWordEnd();
            bufferIndex = bufferIndex - prepLength + assoStream.Step;
            offset = offset - prepLength + assoStream.Step;
            prepLength = assoStream.Step;
        }

        // If a complete word exists and it is longer than the previous word in the buffer,
        // cut the buffer down to the shorter word and keep the surplus characters in the
        // pre-read buffer; otherwise undo this attempt.
        if (assoStream.IsWordEnd && (prepLength > len || prepLength > start && len > start + 1 || prepLength >= start && Char.GetUnicodeCategory(c) != UnicodeCategory.OtherLetter))
        {
            len = start;
        }
        else
        {
            // Rewind only when characters were consumed from the text stream above.
            if (end == length)
            {
                bufferIndex = bufferIndex + len - prepLength - start;
                offset = offset + len - prepLength - start;
            }
            prepLength = 0;
        }
    }
    else if (len == 2)
    {
        // Save the part of speech (not implemented):
        // WordPart wp1 = assoStream.GetWordPart();
        assoStream.Reset();
        char c = buffer[1];

        // Starting from the second character of the word, match characters in sequence.
        if (assoStream.Associate(c) && assoStream.Associate(buffer[2]))
        {
            Prep(c);
            Prep(buffer[2]);
            while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
            {
                Prep(c);
            }

            // Roll back to the last complete word seen while pre-reading.
            if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
            {
                assoStream.BackToLastWordEnd();
                bufferIndex = bufferIndex - prepLength + assoStream.Step;
                offset = offset - prepLength + assoStream.Step;
                prepLength = assoStream.Step;
            }

            if (assoStream.IsWordEnd)
            {
                if (prepLength == 2)
                {
                    // Part-of-speech combination rule, not yet implemented:
                    // WordPart wp2 = assoStream.GetWordPart();
                    // if (WordPart.Combo(wp1, wp2))
                    //     len = 2;
                    // else
                    //     len = 1;
                    // Temporary approach: try to match a word starting at the second
                    // pre-read character instead.
                    assoStream.Reset();
                    if (assoStream.Associate(prepBuffer[1]) && assoStream.Associate(c))
                    {
                        char[] tmp = new char[MAX_WORD_LEN];
                        tmp[0] = prepBuffer[1];
                        tmp[1] = c;
                        int tmpLength = 2;
                        // NOTE(review): tmpLength is not checked against MAX_WORD_LEN here;
                        // confirm Associate cannot keep matching past that many characters.
                        while (Char.GetUnicodeCategory(c = GetNextChar()) == UnicodeCategory.OtherLetter && assoStream.Associate(c))
                        {
                            tmp[tmpLength++] = c;
                        }

                        // Roll back to the last complete word seen in tmp.
                        if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                        {
                            assoStream.BackToLastWordEnd();
                            bufferIndex = bufferIndex - tmpLength + assoStream.Step;
                            offset = offset - tmpLength + assoStream.Step;
                            tmpLength = assoStream.Step;
                        }

                        if (assoStream.IsWordEnd)
                        {
                            // The later word wins: it replaces the pre-read buffer content.
                            tmp.CopyTo(prepBuffer, 0);
                            prepLength = tmpLength;
                        }
                        else
                        {
                            // No word found: give back everything read into tmp and drop
                            // the pre-read characters.
                            bufferIndex = bufferIndex - tmpLength;
                            offset = offset - tmpLength;
                            prepLength = 0;
                        }
                    }
                    else if (LinkRule.Test(buffer[0]))
                    {
                        // The first character passes LinkRule: emit it alone.
                        len = 1;
                    }
                    else
                    {
                        bufferIndex = bufferIndex - prepLength + 1;
                        offset = offset - prepLength + 1;
                        prepLength = 0;
                    }
                }
                else
                {
                    len = 1;
                }
            }
            else
            {
                // No complete word was found from the second character: undo the pre-read.
                bufferIndex = bufferIndex - prepLength + 1;
                offset = offset - prepLength + 1;
                prepLength = 0;
            }
        }
    }

    // Publish the chosen word length.
    length = len;
}
/// <summary>
/// Reads the next token from the input.
/// </summary>
/// <returns>
/// The flushed token; for a token that starts with a decimal digit, leading and trailing
/// '.' characters are trimmed. Returns <see cref="String.Empty"/> when the first character
/// is not a decimal digit, a lowercase letter, or an "other letter".
/// </returns>
public string Next()
{
    // Characters still pending in the output buffer are emitted before reading more input.
    if (length > 0)
    {
        return Flush();
    }

    // Restart dictionary matching, then fetch the first character of the new token.
    assoStream.Reset();
    char c = GetNextChar();

    // Nothing left to read: end the current read.
    if (dataLen < bufferIndex)
    {
        return Flush();
    }

    Push(c);

    // The category of the first character selects the reading strategy.
    UnicodeCategory leading = Char.GetUnicodeCategory(c);

    // Number: take every following digit; a '.' is accepted only when the previous
    // buffered character is not already a '.'.
    if (leading == UnicodeCategory.DecimalDigitNumber)
    {
        while (true)
        {
            c = GetNextChar();
            bool extendsNumber =
                Char.GetUnicodeCategory(c) == UnicodeCategory.DecimalDigitNumber
                || (c == '.' && buffer[length - 1] != '.');
            if (!extendsNumber || length >= MAX_WORD_LEN)
            {
                break;
            }
            Push(c);
        }

        // The character that ended the run is handed back to the input.
        Back();
        return Flush().Trim('.');
    }

    // Lowercase word: take following lowercase letters plus '+' and '#'.
    if (leading == UnicodeCategory.LowercaseLetter)
    {
        while (true)
        {
            c = GetNextChar();
            bool extendsWord =
                Char.GetUnicodeCategory(c) == UnicodeCategory.LowercaseLetter
                || c == '+'
                || c == '#';
            if (!extendsWord || length >= MAX_WORD_LEN)
            {
                break;
            }
            Push(c);
        }

        Back();
        return Flush();
    }

    if (leading == UnicodeCategory.OtherLetter)
    {
        // U+4E00..U+9FA5 (CJK unified ideographs) or U+3042..U+3093 (hiragana) go through
        // dictionary segmentation; any other "other letter" is simply read as one run.
        bool useDictionary = (c > 0x4DFF && c < 0x9FA6) || (c > 0x3041 && c < 0x3094);

        if (!useDictionary)
        {
            while (true)
            {
                c = GetNextChar();
                if (Char.GetUnicodeCategory(c) != UnicodeCategory.OtherLetter || length >= MAX_WORD_LEN)
                {
                    break;
                }
                Push(c);
            }
        }
        else
        {
            assoStream.Associate(c);

            while (true)
            {
                // Read the next character and stop unless it is another "other letter".
                c = GetNextChar();
                if (Char.GetUnicodeCategory(c) != UnicodeCategory.OtherLetter || length >= MAX_WORD_LEN)
                {
                    break;
                }

                // Keep descending the dictionary for as long as the character matches.
                if (!assoStream.IsBegin && assoStream.HasChildren && assoStream.Associate(c))
                {
                    Push(c);
                    continue;
                }

                // The match stopped in the middle of an entry, but a complete word was
                // seen on the way: fall back to it and rewind the read position.
                if (!assoStream.IsWordEnd && assoStream.IsOccurWord)
                {
                    assoStream.BackToLastWordEnd();
                    int matched = assoStream.Step;
                    int surplus = length - matched;
                    bufferIndex -= surplus;
                    offset -= surplus;
                    length = matched;
                }

                if (assoStream.IsWordEnd)
                {
                    // A complete word: push the look-ahead character and resolve ambiguity.
                    Push(c);
                    ClearDifferentMeanings();
                }
                else if (!LinkRule.Test(buffer[0]))
                {
                    // Not a dictionary word: try unknown-word (e.g. personal name) matching.
                    Push(c);
                    StrangeWordMacth();
                }
                else
                {
                    // The first character passes LinkRule: emit it alone and rewind the rest.
                    bufferIndex -= length;
                    offset -= length;
                    length = 1;
                    return Flush();
                }

                // The non-matching character has been handled; matching ends here.
                break;
            }
        }

        Back();
        return Flush();
    }

    // Anything else (punctuation, whitespace, ...) produces an empty token.
    length = 0;
    return String.Empty;
}
/// <summary>
/// Runs unknown-word detection (family names, place names) on a character sequence.
/// </summary>
/// <param name="word">
/// The character sequence. <c>word[0]</c> and <c>word[1]</c> are read unconditionally, so
/// the caller must supply at least two characters.
/// </param>
/// <param name="length">Number of valid characters in <paramref name="word"/>.</param>
/// <returns>
/// A character count taken from the start of <paramref name="word"/> (presumably the length
/// of the detected word - confirm against the caller): <paramref name="length"/> when a
/// family name is followed by at most two characters, the family name plus one or two
/// further characters for longer input, the position at which a known dictionary pair or a
/// place-name character was found, or 1 when nothing matched.
/// </returns>
public static int Match(Char[] word, int length)
{
    int nStart = 0;

    // Family-name detection. The first test uses a key combined from the first two
    // characters (presumably a two-character surname - confirm against FamilyNameRule),
    // the second a single character that is not followed by a link character.
    if (FamilyNameRule.Test(((int)word[0] << 2) + (int)word[1]))
    {
        nStart = 2;
    }
    else if (FamilyNameRule.Test((int)word[0]) && !LinkRule.Test(word[1]))
    {
        nStart = 1;
    }
    else
    {
        AssociateStream assoStream = new AssociateStream();

        // Link-character detection: if the second character is not a link character, look
        // for the first adjacent pair (word[i], word[i + 1]) that the dictionary accepts.
        if (!LinkRule.Test(word[1]))
        {
            // The bound is length - 1 because each iteration also reads word[nStart] after
            // the increment; with "nStart < length" the last iteration would read
            // word[length], one past the valid characters.
            for (nStart = 1; nStart < length - 1;)
            {
                assoStream.Associate(word[nStart++]);
                if (!assoStream.IsBegin && assoStream.Associate(word[nStart]))
                {
                    return nStart - 1;
                }
                assoStream.Reset();
            }
        }

        // Place-name detection over adjacent pairs.
        for (nStart = 0; nStart < length - 1;)
        {
            // The stream is not consulted in this loop; the reset is kept from the original.
            assoStream.Reset();
            if (!LinkRule.Test(word[nStart++]) && PlaceRule.Test(word[nStart]))
            {
                return nStart + 1;
            }
        }

        return 1;
    }

    // A family name followed by at most two characters: the whole sequence is one name.
    if (length - nStart <= 2)
    {
        return length;
    }

    // At least three characters follow the family name, so word[nEnd + 1] below is always
    // inside the valid range. The name includes one character after the family name, and a
    // second one when the character after that is a link character or the two cannot start
    // a known dictionary word together.
    int nEnd = nStart + 1;
    AssociateStream nameStream = new AssociateStream();
    if (LinkRule.Test(word[nEnd + 1]) || !(nameStream.Associate(word[nEnd]) && nameStream.Associate(word[nEnd + 1])))
    {
        nEnd++;
    }

    return nEnd;
}