/// <summary>
/// Emits the final character of the sentence when the main scan stopped before consuming it.
/// </summary>
/// <param name="text">The sentence being segmented.</param>
/// <param name="currentIndex">
/// Cursor position at which the main scan stopped. Nothing is emitted when it is already at or past
/// the end of <paramref name="text"/>. It is also forwarded to <c>Append</c> as the second argument
/// (presumably the start offset of the emitted token; confirm against AuxiliaryString).
/// </param>
/// <param name="auxString">Output buffer that receives the character followed by the separator.</param>
/// <param name="lastMaxLenElement">
/// The most recent dictionary match recorded by the scan. When it ends with the final character,
/// that character is treated as already emitted and is skipped.
/// </param>
private void ProcessLastChar(string text, int currentIndex, AuxiliaryString auxString, string lastMaxLenElement)
{
    // The scan already consumed the whole sentence (this also covers an empty sentence).
    if (currentIndex >= text.Length)
    {
        return;
    }

    string lastChar = text.Substring(text.Length - 1);

    // Ordinal comparison is required here: the culture-sensitive EndsWith(string) overload ignores
    // "ignorable" characters (soft hyphen, zero-width joiner, ...), so it reports a match even
    // against an empty string and the trailing character would be silently dropped.
    // NOTE(review): lastMaxLenElement is whatever match was recorded last, not necessarily one that
    // touches the end of the sentence; a stale match ending in the same character still suppresses
    // the final character. Confirm whether that is intended.
    if (lastMaxLenElement.EndsWith(lastChar, System.StringComparison.Ordinal))
    {
        return;
    }

    auxString.Append(lastChar, currentIndex);
    auxString.Append(this.Separator);
}
/// <summary>
/// Handles a character that is not a Chinese character: spaces are skipped, every other character
/// is appended to the output, and a separator is added when the character closes a token.
/// </summary>
/// <param name="text">The sentence being segmented (not read by this method).</param>
/// <param name="auxString">Output buffer that receives the character and, if needed, the separator.</param>
/// <param name="currentChar">The single-character string at <paramref name="currentIndex"/>.</param>
/// <param name="nextChar">The single-character string that follows <paramref name="currentChar"/>.</param>
/// <param name="currentIndex">Position of <paramref name="currentChar"/>; forwarded to <c>Append</c>.</param>
private void ProcessNonChinese(string text, AuxiliaryString auxString, string currentChar, string nextChar, int currentIndex)
{
    // A space produces no output at all.
    if (currentChar == " ")
    {
        return;
    }

    // A token ends here when:
    //   - the next character is a space, or
    //   - a letter is followed by a non-letter, or
    //   - a digit is followed by a letter.
    // The flag is computed before anything is appended so the checks run in the original order.
    bool closesToken =
        nextChar == " "
        || (Character.IsLetter(currentChar) && !Character.IsLetter(nextChar))
        || (Character.IsNumber(currentChar) && Character.IsLetter(nextChar));

    auxString.Append(currentChar, currentIndex);

    if (closesToken)
    {
        auxString.Append(this.Separator);
    }
}
/// <summary>
/// Called when the dictionary contains words starting with the current two characters. Finds the
/// longest dictionary word beginning at <paramref name="currentIndex"/>, then breaks that word down
/// and emits every dictionary word found as a prefix of each remaining part.
/// </summary>
/// <param name="text">The sentence being segmented.</param>
/// <param name="auxString">Output buffer that receives the emitted words and separators.</param>
/// <param name="currentChar">The single-character string at <paramref name="currentIndex"/>.</param>
/// <param name="currentTwoChars">The two-character string starting at <paramref name="currentIndex"/>.</param>
/// <param name="currentIndex">
/// Scan cursor. On a successful match it is advanced to the last character of the matched word
/// (the caller's loop increment then moves past it). It is left unchanged when nothing matches.
/// </param>
/// <param name="maxLenElement">
/// Receives the longest matched word. It keeps its previous value when nothing matches.
/// </param>
private void ProcessExistingElement(string text, AuxiliaryString auxString, string currentChar, string currentTwoChars, ref int currentIndex, ref string maxLenElement)
{
    // The value stored under [currentChar][currentTwoChars] is used as the upper bound for the
    // candidate length, capped by the number of characters left in the sentence.
    int remaining = text.Length - currentIndex;
    int dictionaryLimit = this.SegmentDictionary.SegmentKeys[currentChar][currentTwoChars];
    int maxlen = dictionaryLimit > remaining ? remaining : dictionaryLimit;

    // Look for the longest dictionary word starting at the cursor, longest candidate first.
    bool exist = false;
    for (int i = maxlen; i >= 2; i--)
    {
        string element = text.Substring(currentIndex, i);
        if (this.SegmentDictionary.SegmentDict.ContainsKey(element))
        {
            maxLenElement = element;
            exist = true;
            break;
        }
    }

    // No word starts here: emit the single character as its own token.
    if (!exist)
    {
        auxString.Append(currentChar, currentIndex);
        auxString.Append(this.Separator);
        return;
    }

    if (!auxString.NeedAddPos && currentIndex > 0)
    {
        auxString.Append(currentTwoChars, currentIndex);
        auxString.Append(this.Separator);
    }

    // Break the longest word down: on each pass emit every dictionary word that is a prefix of the
    // remainder, then drop the shortest such prefix and continue with what is left.
    string strLeave = maxLenElement;
    int startpos = currentIndex;
    while (strLeave.Length >= 2)
    {
        // Reset on every pass. Previously this value leaked from the preceding pass, so a pass
        // that found no prefix could call Substring with a length larger than the remainder and
        // throw ArgumentOutOfRangeException. The default of 2 is always valid here because the
        // loop only runs while at least two characters remain.
        int minLen = 2;
        bool remove = false;

        for (int j = 2; j <= strLeave.Length; j++)
        {
            string subElement = strLeave.Substring(0, j);
            if (this.SegmentDictionary.SegmentDict.ContainsKey(subElement))
            {
                auxString.Append(subElement, startpos);
                auxString.Append(this.Separator);

                // Only the first (shortest) matching prefix decides how far to advance.
                if (!remove)
                {
                    remove = true;
                    minLen = subElement.Length;
                }
            }
        }

        strLeave = strLeave.Substring(minLen);
        startpos += minLen;
    }

    // Leave the cursor on the last character of the matched word.
    currentIndex += maxLenElement.Length - 1;
}
/// <summary>
/// Segments a single sentence into words.
/// </summary>
/// <param name="text">The sentence to segment.</param>
/// <param name="result">
/// Replaced with the builder of the internal output buffer once segmentation has run. Left
/// untouched when <paramref name="text"/> is null or empty, or when the dictionary has no segments.
/// </param>
/// <param name="startList">
/// Set to an empty list first, then replaced with the output buffer's start list once
/// segmentation has run.
/// </param>
protected override void SegmentSentence(string text, ref StringBuilder result, out List<int> startList)
{
    startList = new List<int>();

    // Nothing to do without input or without a loaded dictionary.
    bool nothingToSegment = string.IsNullOrEmpty(text) || this.SegmentDictionary.Segments.Count <= 0;
    if (nothingToSegment)
    {
        return;
    }

    AuxiliaryString buffer = new AuxiliaryString();
    string longestMatch = "";
    int position = 0;

    // Scan every character except the last one; each step looks at the current character and
    // the one after it.
    while (position < text.Length - 1)
    {
        string head = text.Substring(position, 1);
        string following = text.Substring(position + 1, 1);

        if (!Character.IsChinese(head))
        {
            // Current character is not Chinese.
            ProcessNonChinese(text, buffer, head, following, position);
        }
        else if (!Character.IsChinese(following))
        {
            // A Chinese character followed by a non-Chinese one stands alone.
            buffer.Append(head, position);
            buffer.Append(this.Separator);
        }
        else
        {
            string pair = head + following;

            if (this.SegmentDictionary.Segments.ContainsKey(pair))
            {
                // Words starting with this pair exist in the dictionary: take the longest match.
                // The helper may advance the cursor past the matched word.
                ProcessExistingElement(text, buffer, head, pair, ref position, ref longestMatch);
            }
            else if (this.NameSegments.ContainsKey(head))
            {
                // Current character is a surname: emit it without a separator.
                buffer.Append(head, position);
            }
            else if (this.NameSegments.ContainsKey(pair))
            {
                // NOTE(review): the pair is emitted but the cursor only moves by one, so the
                // second character is visited again on the next step. Confirm this is intended.
                buffer.Append(pair, position);
            }
            else
            {
                buffer.Append(head, position);
                buffer.Append(this.Separator);
            }
        }

        position++;
    }

    // Handle the final character if the scan did not consume it.
    ProcessLastChar(text, position, buffer, longestMatch);

    result = buffer.Builder;
    startList = buffer.StartList;
}