/// <summary>
/// Collects a run of "unknown" characters and appends it to the current sentence
/// as a single word typed as <c>WordType.Noun</c>. The original note cites Japanese,
/// Arabic and Korean text as examples of such characters.
/// </summary>
/// <param name="ch">First character of the run; it is always part of the resulting word.</param>
private void flexUnknowChar(char ch)
{
    var text = ch.ToString();
    bool readAhead = false;

    while (hasNext)
    {
        var candidate = next();
        readAhead = true;

        // The run ends at the first character accepted by Is_ACN or Is_Mark.
        string candidateText = candidate.ToString();
        if (Regex_Helper.Is_ACN(candidateText) || Regex_Helper.Is_Mark(candidateText))
        {
            break;
        }

        text += candidate;
    }

    // previous() is invoked once whenever next() was called at least once,
    // regardless of whether the loop ended by break or by hasNext turning false.
    // NOTE(review): presumably this steps the reader back one character - confirm against previous().
    if (readAhead)
    {
        previous();
    }

    var word = new _WordInnfo
    {
        Name = text,
        MaxType = WordType.Noun
    };
    _susentence.Words.Add(word);
}
/// <summary>
/// Handles a fragment that begins with a special character at the current token.
/// </summary>
/// <returns>
/// <c>true</c> when the token was handed to <c>flexNumberAlpha</c> or
/// <c>flexUnknowChar</c>; <c>false</c> when neither rule matched.
/// </returns>
private bool workSpecialChars()
{
    string current = _token.ToString();

    // Fragments that start with a letter (the original note includes Greek letters),
    // a number or an English punctuation mark: e-mail, address, math expression, and so on.
    bool startsNumberAlpha = Regex_Helper.Is_AN(current)
                             || Regex_Helper.Is_Concrete_Number(current)
                             || Regex_Helper.Is_English_Mark(current);
    if (startsNumberAlpha)
    {
        flexNumberAlpha(_token);
        return true;
    }

    // Fragments that start with an unknown character, i.e. one rejected by both
    // Is_ACN and Is_Mark (the original note mentions Japanese and Korean text).
    bool isRecognized = Regex_Helper.Is_ACN(current) || Regex_Helper.Is_Mark(current);
    if (isRecognized)
    {
        return false;
    }

    flexUnknowChar(_token);
    return true;
}
/// <summary>
/// Reverse-segmentation helper: reads the run of unknown characters that starts at
/// <paramref name="i"/> and appends it to <paramref name="ls"/> as a single word
/// typed as <c>WordType.Noun</c>.
/// </summary>
/// <param name="ls">Word list that receives the result of the reverse segmentation.</param>
/// <param name="temp">Character sequence being segmented; it is only read here.</param>
/// <param name="i">
/// On entry, the index of the first character of the run. On exit, the index of the
/// last character that belongs to the run, so that the caller's loop increment moves
/// on to the first character that was not consumed.
/// </param>
private void reflexUnknowChars(List<_WordInnfo> ls, ref string temp, ref int i)
{
    int start = i;

    // Exclusive end of the run; the character at 'start' is always included.
    int end = start + 1;
    while (end < temp.Length)
    {
        // The run stops at the first character accepted by Is_ACN or Is_Mark.
        string candidate = temp[end].ToString();
        if (Regex_Helper.Is_ACN(candidate) || Regex_Helper.Is_Mark(candidate))
        {
            break;
        }

        end++;
    }

    var word = new _WordInnfo();
    word.Name = temp.Substring(start, end - start);
    word.MaxType = WordType.Noun;
    ls.Add(word);

    // Always leave i on the last consumed character. The previous bookkeeping
    // (i++ before break, then a conditional i--) left i on the terminating character
    // whenever that character was the last one in temp, so the caller skipped it and
    // it was never segmented.
    i = end - 1;
}
/// <summary>
/// Segments <paramref name="temp"/> into words by walking the search tree exposed by
/// <c>_provider.NegtiveDic</c>, then reverses the collected list before returning it.
/// </summary>
/// <param name="temp">Character sequence to segment.</param>
/// <returns>The word list after <c>afterReflex</c> and <c>reverseList</c> have been applied.</returns>
public List<_WordInnfo> Reflex(string temp)
{
    var words = new List<_WordInnfo>();

    Search_Tree_Node<_WordInnfo> node = null;      // node reached by the current walk
    Search_Tree_Node<_WordInnfo> bestNode = null;  // last non-empty node seen on the current walk
    int depth = 0;                                 // steps taken in the current walk
    int bestDepth = 0;                             // value recorded together with bestNode
    bool atRoot = true;                            // true when the next lookup starts from the dictionary itself

    for (int i = 0; i < temp.Length; i++)
    {
        // Digits, letters and unknown characters are only handled between walks.
        if (depth == 0)
        {
            string head = temp[i].ToString();

            // Letters and digits.
            if (Regex_Helper.Is_Math_Expression(head))
            {
                reflexNumberAlpha(words, ref temp, ref i);
                continue;
            }

            // Unknown characters: rejected by both Is_ACN and Is_Mark.
            bool recognized = Regex_Helper.Is_ACN(head) || Regex_Helper.Is_Mark(head);
            if (!recognized)
            {
                reflexUnknowChars(words, ref temp, ref i);
                continue;
            }
        }

        // One more search step.
        depth++;

        bool startedAtRoot = atRoot;
        if (startedAtRoot)
        {
            node = _provider.NegtiveDic.Get_Node(temp[i].ToString());
            atRoot = false;
        }
        else
        {
            node = node.Get_Child(temp[i]);
        }

        if (node == null)
        {
            // Dead end: endReflex receives all walk state by reference.
            endReflex(ref temp, words, ref i, ref depth, ref bestDepth, ref atRoot, node, ref bestNode);
        }
        else if (!node.Is_Empty)
        {
            // Remember the latest non-empty node.
            // NOTE(review): a root step records depth while a child step records depth - 1;
            // this asymmetry is kept exactly as in the original - confirm against endReflex.
            bestNode = node;
            bestDepth = startedAtRoot ? depth : depth - 1;
        }
    }

    // End-of-scan handling.
    afterReflex(words, node, bestNode, temp);

    // Reverse the list.
    reverseList(words);
    return words;
}