/// <summary>
/// Segments a fragment that starts with a "special" character, i.e. one that is
/// handled outside the regular dictionary lookup.
/// </summary>
/// <returns>
/// <c>true</c> when the current token was handed to one of the special-case
/// scanners; <c>false</c> when it matched neither case.
/// </returns>
private bool workSpecialChars()
{
    var tokenText = _token.ToString();

    // Fragments that start with a letter/digit, a concrete number or an English
    // punctuation mark (per the original note: e-mail, addresses, math expressions, ...).
    bool startsAlphaNumericRun = Regex_Helper.Is_AN(tokenText)
                                 || Regex_Helper.Is_Concrete_Number(tokenText)
                                 || Regex_Helper.Is_English_Mark(tokenText);
    if (startsAlphaNumericRun)
    {
        flexNumberAlpha(_token);
        return true;
    }

    // Anything recognised by Is_ACN or Is_Mark is left to the caller.
    bool isRecognised = Regex_Helper.Is_ACN(tokenText) || Regex_Helper.Is_Mark(tokenText);
    if (isRecognised)
    {
        return false;
    }

    // Unknown characters (per the original note: other scripts such as Japanese or Korean).
    flexUnknowChar(_token);
    return true;
}
/// <summary>
/// Collects a run of unknown characters (neither matched by Is_ACN nor by Is_Mark),
/// starting with <paramref name="ch"/>, and appends it to the current sentence as a
/// single word typed <c>WordType.Noun</c>.
/// </summary>
/// <param name="ch">The first character of the run; always included in the word.</param>
private void flexUnknowChar(char ch)
{
    var collected = ch.ToString();
    bool advanced = false;

    while (hasNext)
    {
        var candidate = next();
        advanced = true;

        var candidateText = candidate.ToString();
        bool isRecognised = Regex_Helper.Is_ACN(candidateText) || Regex_Helper.Is_Mark(candidateText);
        if (isRecognised)
        {
            break;
        }

        collected += candidate;
    }

    // Step back one position whenever the cursor was advanced at least once.
    // NOTE(review): this also rewinds when the loop ended because the input ran out
    // (last character already appended) - confirm against next()/previous() semantics.
    if (advanced)
    {
        previous();
    }

    var word = new _WordInnfo();
    word.Name = collected;
    word.MaxType = WordType.Noun;
    _susentence.Words.Add(word);
}
/// <summary>
/// Collects a run of letters, digits and non-Chinese marks starting with
/// <paramref name="ch"/>, classifies it via <c>SetAlphaNumberType</c> and appends it
/// to the current sentence as a single word.
/// </summary>
/// <param name="ch">The first character of the run; always included in the word.</param>
private void flexNumberAlpha(char ch)
{
    var collected = ch.ToString();

    // Records whether the cursor was advanced, in which case it is stepped back once below.
    bool advanced = false;

    while (_currentPos < _context.Length - 1)
    {
        advanced = true;
        var candidate = next();
        var candidateText = candidate.ToString();

        if (!Regex_Helper.Is_AN(candidateText))
        {
            // A non-alphanumeric character only continues the run when it is a mark
            // that is not a Chinese mark (web addresses, math expressions, ...).
            if (!Regex_Helper.Is_Mark(candidateText) || Regex_Helper.Is_Chinese_Mark(candidateText))
            {
                break;
            }
        }

        collected += candidate;
    }

    var word = new _WordInnfo();
    word.Name = collected;

    // The fragment may be a number or something else (math expression, URL, ...).
    SetAlphaNumberType(word);

    // NOTE(review): the rewind also happens when the loop stopped at the end of the
    // input rather than on a rejected character - confirm against next()/previous().
    if (advanced)
    {
        previous();
    }

    // Re-assignment kept from the original implementation.
    word.Name = collected;
    _susentence.Words.Add(word);
}
/// <summary>
/// Reverse-segmentation helper: reads a run of unknown characters (neither matched
/// by Is_ACN nor by Is_Mark) starting at <paramref name="i"/> and appends it to
/// <paramref name="ls"/> as a single word typed <c>WordType.Noun</c>.
/// </summary>
/// <param name="ls">Word list receiving the reverse-segmentation result.</param>
/// <param name="temp">Character sequence being segmented.</param>
/// <param name="i">
/// Current scan position. On return it holds the index of the last character that
/// belongs to the emitted word, so a caller that increments it lands on the first
/// character after the run.
/// </param>
private void reflexUnknowChars(List<_WordInnfo> ls, ref string temp, ref int i)
{
    int start = i;
    int end = start + 1; // exclusive end of the run; temp[start] is always part of it

    while (end < temp.Length)
    {
        var candidateText = temp[end].ToString();
        if (Regex_Helper.Is_ACN(candidateText) || Regex_Helper.Is_Mark(candidateText))
        {
            break;
        }
        end++;
    }

    var word = new _WordInnfo();
    word.Name = temp.Substring(start, end - start);
    word.MaxType = WordType.Noun;
    ls.Add(word);

    // Always leave the cursor on the last consumed character. The previous
    // "i++ / conditional i--" bookkeeping skipped the terminating character when it
    // happened to be the final character of temp.
    i = end - 1;
}
/// <summary>
/// Reverse-segmentation helper: reads a run of letters and digits starting at
/// <paramref name="i"/>, classifies it via <c>Lexicer.SetAlphaNumberType</c> and
/// appends it to <paramref name="ls"/> as a single word.
/// </summary>
/// <param name="ls">Word list receiving the reverse-segmentation result.</param>
/// <param name="temp">Character sequence being segmented.</param>
/// <param name="i">
/// Current scan position. On return it holds the index of the last character that
/// belongs to the emitted word, so a caller that increments it lands on the first
/// character after the run.
/// </param>
private void reflexNumberAlpha(List<_WordInnfo> ls, ref string temp, ref int i)
{
    int start = i;
    int end = start + 1; // exclusive end of the run; temp[start] is always part of it

    while (end < temp.Length)
    {
        var candidateText = temp[end].ToString();
        if (!Regex_Helper.Is_Number(candidateText) && !Regex_Helper.Is_Alpha(candidateText))
        {
            break;
        }
        end++;
    }

    var word = new _WordInnfo();
    word.Name = temp.Substring(start, end - start);
    Lexicer.SetAlphaNumberType(word);
    ls.Add(word);

    // Always leave the cursor on the last consumed character. The previous
    // "i++ / conditional i--" bookkeeping skipped the terminating character when it
    // happened to be the final character of temp.
    i = end - 1;
}
/// <summary>
/// Splits a math expression into a sequence of letter runs, digit runs and
/// single-character symbols, in their original order.
/// </summary>
/// <param name="str">The expression to split. Null or empty yields an empty list.</param>
/// <returns>
/// The tokens in order of appearance. A character is a letter when
/// <c>Regex_Helper.Is_Alpha</c> accepts it, a digit when <c>Regex_Helper.Is_Number</c>
/// accepts it, and a symbol otherwise. Empty tokens are never emitted.
/// </returns>
public static List<string> FlexMathExpression(string str)
{
    const int None = -1;    // no run in progress
    const int Digits = 0;   // currently inside a digit run
    const int Letters = 1;  // currently inside a letter run

    var tokens = new List<string>();
    if (string.IsNullOrEmpty(str))
    {
        return tokens;
    }

    int state = None;
    var current = string.Empty;

    foreach (var ch in str)
    {
        var text = ch.ToString();

        int kind;
        if (Regex_Helper.Is_Alpha(text))
        {
            kind = Letters;
        }
        else if (Regex_Helper.Is_Number(text))
        {
            kind = Digits;
        }
        else
        {
            kind = None;
        }

        if (kind == None)
        {
            // A symbol terminates any pending run and is a token on its own.
            if (current.Length > 0)
            {
                tokens.Add(current);
                current = string.Empty;
            }
            tokens.Add(text);
            state = None;
            continue;
        }

        // Switching between letters and digits closes the pending run. The buffer is
        // restarted (not appended to) so the previous run is not duplicated.
        if (kind != state && current.Length > 0)
        {
            tokens.Add(current);
            current = string.Empty;
        }

        current += ch;
        state = kind;
    }

    if (current.Length > 0)
    {
        tokens.Add(current);
    }

    return tokens;
}
/// <summary>
/// Returns every segment produced by <c>Flex.Flex_Mixed_Number_And_Other</c> whose
/// last character is accepted by <c>Regex_Helper.Is_Number</c>.
/// </summary>
/// <param name="str">The mixed string to scan.</param>
/// <returns>The matching segments, trimmed, in the order they were produced.</returns>
public static List<string> Get_All_Number_In_Mixed_String(string str)
{
    var numbers = new List<string>();

    foreach (var segment in Flex.Flex_Mixed_Number_And_Other(str))
    {
        // An empty segment has no last character; indexing it would throw.
        if (string.IsNullOrEmpty(segment))
        {
            continue;
        }

        var lastChar = segment[segment.Length - 1];
        if (Regex_Helper.Is_Number(lastChar.ToString()))
        {
            numbers.Add(segment.Trim());
        }
    }

    return numbers;
}
/// <summary>
/// Assigns <c>MaxType</c> of <paramref name="w"/> based on how many characters of
/// its name are accepted by <c>Regex_Helper.Is_Concrete_Number</c>.
/// Public and static because the reverse segmenter uses it as well.
/// </summary>
/// <param name="w">The word to classify; its <c>MaxType</c> is overwritten.</param>
public static void SetAlphaNumberType(_WordInnfo w)
{
    // Presumably counts the characters of the name that satisfy the predicate.
    var numericCount = StringHelper.Count(w.Name, (x) => Regex_Helper.Is_Concrete_Number(x.ToString()));
    var length = w.Name.Length;

    // Numeric when every character counted, or when exactly one did not and the
    // name contains a '.'.
    bool isNumber = numericCount == length
                    || (numericCount == length - 1 && w.Name.Contains('.'.ToString()));

    w.MaxType = isNumber ? WordType.NumberConcrete : WordType.NounAlphaNumberMark;
}
/// <summary>
/// Creates the info object for <paramref name="str"/> and stores the type reported
/// by <c>Regex_Helper.Get_String_Type</c> in <c>Type</c>.
/// </summary>
/// <param name="str">The string to classify.</param>
public String_Info(string str)
{
    var detectedType = Regex_Helper.Get_String_Type(str);
    this.Type = detectedType;
}
/// <summary>
/// Tells whether <paramref name="ch"/> is a stop word: either
/// <c>IsFirstSingleWord</c> accepts it or <c>Regex_Helper.Is_AMN</c> accepts its
/// string form.
/// </summary>
/// <param name="ch">The character to test.</param>
/// <returns><c>true</c> when either check succeeds; otherwise <c>false</c>.</returns>
public static bool IsStopWord(this char ch)
{
    if (IsFirstSingleWord(ch))
    {
        return true;
    }

    return Regex_Helper.Is_AMN(ch.ToString());
}
/// <summary>
/// Segments <paramref name="temp"/> using the dictionary exposed as
/// <c>_provider.NegtiveDic</c>, then reverses the collected word list before
/// returning it.
/// </summary>
/// <param name="temp">The character sequence to segment.</param>
/// <returns>The segmented words after <c>reverseList</c> has been applied.</returns>
public List<_WordInnfo> Reflex(string temp)
{
    var words = new List<_WordInnfo>();

    Search_Tree_Node<_WordInnfo> node = null;      // node reached by the current lookup
    Search_Tree_Node<_WordInnfo> bestNode = null;  // last non-empty node seen
    int depth = 0;                                 // lookups made for the current word
    int bestDepth = 0;                             // depth recorded together with bestNode
    bool restartFromRoot = true;                   // next lookup starts from the dictionary

    for (int i = 0; i < temp.Length; i++)
    {
        // Letters, digits and unknown characters are only handled when no
        // dictionary lookup is in progress.
        if (depth == 0)
        {
            var head = temp[i].ToString();

            // Letters / digits.
            if (Regex_Helper.Is_Math_Expression(head))
            {
                reflexNumberAlpha(words, ref temp, ref i);
                continue;
            }

            // Unknown characters.
            if (!(Regex_Helper.Is_ACN(head) || Regex_Helper.Is_Mark(head)))
            {
                reflexUnknowChars(words, ref temp, ref i);
                continue;
            }
        }

        // Count this lookup.
        depth++;

        bool lookedUpFromRoot = restartFromRoot;
        if (lookedUpFromRoot)
        {
            node = _provider.NegtiveDic.Get_Node(temp[i].ToString());
            restartFromRoot = false;
        }
        else
        {
            node = node.Get_Child(temp[i]);
        }

        if (node == null)
        {
            // No continuation: let endReflex finish the current word.
            endReflex(ref temp, words, ref i, ref depth, ref bestDepth, ref restartFromRoot, node, ref bestNode);
        }
        else if (!node.Is_Empty)
        {
            // Remember the latest non-empty node.
            // NOTE(review): a root lookup records depth, a child lookup records
            // depth - 1; kept exactly as in the original - confirm this asymmetry is intended.
            bestNode = node;
            bestDepth = lookedUpFromRoot ? depth : depth - 1;
        }
    }

    // End-of-scan handling.
    afterReflex(words, node, bestNode, temp);

    // Reverse the list.
    reverseList(words);
    return words;
}