Ejemplo n.º 1
0
        /// <summary>
        /// Segments a fragment that starts with a special (non-Chinese) character.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the current token was handed to one of the fragment readers;
        /// <c>false</c> when neither rule matched and nothing was consumed here.
        /// </returns>
        private bool workSpecialChars()
        {
            var tokenText = _token.ToString();

            // Rule 1: fragments that begin with a letter (Greek letters included), a digit
            // or an English punctuation mark - e-mail addresses, web addresses,
            // math expressions and so on.
            bool startsAlphaNumericRun = Regex_Helper.Is_AN(tokenText)
                                         || Regex_Helper.Is_Concrete_Number(tokenText)
                                         || Regex_Helper.Is_English_Mark(tokenText);
            if (startsAlphaNumericRun)
            {
                flexNumberAlpha(_token);
                return true;
            }

            // Rule 2: fragments that begin with an unknown character, e.g. text written
            // in another language (Japanese, Korean, ...). Anything Is_ACN or Is_Mark
            // recognises is not "unknown" and is left for the caller.
            bool isKnownCharacter = Regex_Helper.Is_ACN(tokenText) || Regex_Helper.Is_Mark(tokenText);
            if (isKnownCharacter)
            {
                return false;
            }

            flexUnknowChar(_token);
            return true;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads a run of unknown characters (for example Japanese kana or Arabic
        /// script) starting at <paramref name="ch"/> and appends it to the current
        /// sentence as a single noun.
        /// </summary>
        /// <param name="ch">First character of the unknown run.</param>
        private void flexUnknowChar(char ch)
        {
            var fragment = ch.ToString();

            // Becomes true as soon as next() has been called at least once; in that
            // case the cursor is moved back by one call to previous() afterwards.
            bool consumedAny = false;

            while (hasNext)
            {
                var candidate = next();
                consumedAny = true;

                var candidateText = candidate.ToString();
                bool isKnownCharacter = Regex_Helper.Is_ACN(candidateText) || Regex_Helper.Is_Mark(candidateText);
                if (isKnownCharacter)
                {
                    // The run ends at the first character Is_ACN or Is_Mark recognises.
                    break;
                }

                fragment += candidate;
            }

            // NOTE(review): previous() is also called when the loop stopped because
            // hasNext became false, i.e. when the last character read was appended to
            // the fragment. Whether that makes the caller read it again depends on
            // next()/previous()/hasNext, which are not visible here - confirm.
            if (consumedAny)
            {
                previous();
            }

            var word = new _WordInnfo
            {
                Name    = fragment,
                MaxType = WordType.Noun,
            };
            _susentence.Words.Add(word);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads a run of letters, digits and non-Chinese marks starting at
        /// <paramref name="ch"/> and appends it to the current sentence as one word.
        /// </summary>
        /// <param name="ch">First character of the run.</param>
        private void flexNumberAlpha(char ch)
        {
            var name = ch.ToString();

            // Decides whether to step back afterwards: once the loop body has run
            // (next() was called), the cursor is moved back by one character.
            bool advanced = false;

            while (_currentPos < _context.Length - 1)
            {
                advanced = true;
                var b    = next();
                var text = b.ToString();

                if (Regex_Helper.Is_AN(text))
                {
                    name += b;
                    continue;
                }

                // Marks are kept so that web addresses and math expressions stay in one
                // piece; a Chinese mark, or any other kind of character, ends the run.
                if (!Regex_Helper.Is_Mark(text) || Regex_Helper.Is_Chinese_Mark(text))
                {
                    break;
                }

                name += b;
            }

            // Name is assigned exactly once; SetAlphaNumberType only reads it.
            var w = new _WordInnfo()
            {
                Name = name,
            };

            // Set the word type of the fragment: it may be a number or something else
            // (math expression, web address, ...).
            SetAlphaNumberType(w);

            // NOTE(review): the step back also happens when the loop ended because the
            // end of _context was reached, i.e. when the last character read is part of
            // the word. Whether the caller then reads it again depends on next()/previous(),
            // which are not visible here - confirm.
            if (advanced)
            {
                previous();
            }

            _susentence.Words.Add(w);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Reverse-segmentation helper: reads a run of unknown characters starting at
        /// <c>temp[i]</c> and appends it to <paramref name="ls"/> as a noun.
        /// </summary>
        /// <param name="ls">Word list that collects the reverse-segmentation result.</param>
        /// <param name="temp">Character sequence being segmented; it is only read here.</param>
        /// <param name="i">
        /// Current scan position. On entry it is the index of the first character of the
        /// run; on exit it is the index of the last character that was consumed, so the
        /// caller's own increment moves on to the first unread character.
        /// </param>
        private void reflexUnknowChars(List <_WordInnfo> ls, ref string temp, ref int i)
        {
            var temps = temp[i].ToString();

            // Extend the run while the next character is neither Is_ACN nor Is_Mark.
            int j = i + 1;
            while (j < temp.Length
                   && !Regex_Helper.Is_ACN(temp[j].ToString())
                   && !Regex_Helper.Is_Mark(temp[j].ToString()))
            {
                temps += temp[j];
                j++;
            }

            var _w = new _WordInnfo();
            _w.Name    = temps;
            _w.MaxType = WordType.Noun;
            ls.Add(_w);

            // j is the first index that was NOT consumed (or temp.Length). Leaving the
            // cursor on j - 1 is correct in every case, including the one where the
            // terminating character is the last character of temp; the previous
            // "i != temp.Length - 1" test left the cursor on that terminator, so the
            // caller's increment skipped it.
            i = j - 1;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Reverse-segmentation helper: reads a run of letters and digits starting at
        /// <c>temp[i]</c>, classifies it with <c>Lexicer.SetAlphaNumberType</c> and
        /// appends it to <paramref name="ls"/>.
        /// </summary>
        /// <param name="ls">Word list that collects the reverse-segmentation result.</param>
        /// <param name="temp">Character sequence being segmented; it is only read here.</param>
        /// <param name="i">
        /// Current scan position. On entry it is the index of the first character of the
        /// run; on exit it is the index of the last character that was consumed, so the
        /// caller's own increment moves on to the first unread character.
        /// </param>
        private void reflexNumberAlpha(List <_WordInnfo> ls, ref string temp, ref int i)
        {
            var temps = temp[i].ToString();

            // Extend the run while the next character is a digit or a letter.
            int j = i + 1;
            while (j < temp.Length
                   && (Regex_Helper.Is_Number(temp[j].ToString()) || Regex_Helper.Is_Alpha(temp[j].ToString())))
            {
                temps += temp[j];
                j++;
            }

            var w = new _WordInnfo();
            w.Name = temps;
            Lexicer.SetAlphaNumberType(w);
            ls.Add(w);

            // j is the first index that was NOT consumed (or temp.Length). Leaving the
            // cursor on j - 1 is correct in every case, including the one where the
            // terminating character is the last character of temp; the previous
            // "i != temp.Length - 1" test left the cursor on that terminator, so the
            // caller's increment skipped it.
            i = j - 1;
        }
Ejemplo n.º 6
0
            /// <summary>
            /// Splits a math expression into a sequence of letter runs, digit runs and
            /// single symbols, in their original order.
            /// </summary>
            /// <param name="str">Expression to split; may be null or empty.</param>
            /// <returns>
            /// The pieces of <paramref name="str"/>: each maximal run of characters accepted
            /// by <c>Regex_Helper.Is_Alpha</c>, each maximal run accepted by
            /// <c>Regex_Helper.Is_Number</c>, and every other character as its own
            /// one-character piece. No empty pieces are produced; a null or empty input
            /// yields an empty list.
            /// </returns>
            public static List <string> FlexMathExpression(string str)
            {
                var ls = new List <string>();

                if (string.IsNullOrEmpty(str))
                {
                    return(ls);
                }

                // Kind of the run currently being collected in "run".
                const int NoRun     = -1;
                const int DigitRun  = 0;
                const int LetterRun = 1;

                int runKind = NoRun;
                var run     = string.Empty;

                foreach (var item in str)
                {
                    var text = item.ToString();

                    int kind;
                    if (Regex_Helper.Is_Alpha(text))
                    {
                        kind = LetterRun;
                    }
                    else if (Regex_Helper.Is_Number(text))
                    {
                        kind = DigitRun;
                    }
                    else
                    {
                        kind = NoRun;
                    }

                    if (kind == NoRun)
                    {
                        // A symbol closes the pending run (if any) and is emitted on its own.
                        if (run.Length > 0)
                        {
                            ls.Add(run);
                        }
                        ls.Add(text);
                        run     = string.Empty;
                        runKind = NoRun;
                        continue;
                    }

                    // Switching between letters and digits closes the pending run; the
                    // new run starts from scratch instead of inheriting the old text.
                    if (kind != runKind && run.Length > 0)
                    {
                        ls.Add(run);
                        run = string.Empty;
                    }

                    run    += item;
                    runKind = kind;
                }

                // Flush the run that was still open when the input ended.
                if (run.Length > 0)
                {
                    ls.Add(run);
                }
                return(ls);
            }
Ejemplo n.º 7
0
            /// <summary>
            /// Collects the pieces of <paramref name="str"/> whose last character is a number.
            /// </summary>
            /// <param name="str">Mixed string handed to <c>Flex.Flex_Mixed_Number_And_Other</c>.</param>
            /// <returns>
            /// The trimmed pieces returned by <c>Flex.Flex_Mixed_Number_And_Other</c> whose
            /// last character satisfies <c>Regex_Helper.Is_Number</c>, in the order produced.
            /// </returns>
            public static List <string> Get_All_Number_In_Mixed_String(string str)
            {
                var ls = new List <string>();

                foreach (var item in Flex.Flex_Mixed_Number_And_Other(str))
                {
                    // An empty piece has no last character; indexing it would throw, so
                    // it is skipped (it cannot be a number anyway).
                    if (string.IsNullOrEmpty(item))
                    {
                        continue;
                    }

                    // The test looks at the untrimmed last character, so a piece that
                    // ends in whitespace is not selected.
                    if (Regex_Helper.Is_Number(item[item.Length - 1].ToString()))
                    {
                        ls.Add(item.Trim());
                    }
                }
                return(ls);
            }
Ejemplo n.º 8
0
        /// <summary>
        /// Decides the word type of an alphanumeric fragment and stores it in
        /// <c>w.MaxType</c>. Public and static because the reverse segmenter uses it too.
        /// </summary>
        /// <param name="w">Word whose <c>Name</c> is inspected and whose <c>MaxType</c> is set.</param>
        public static void SetAlphaNumberType(_WordInnfo w)
        {
            var name = w.Name;

            // Number of characters in the name that Is_Concrete_Number accepts.
            var digitCount = StringHelper.Count(name, (c) => Regex_Helper.Is_Concrete_Number(c.ToString()));

            // Every character is a concrete number.
            bool allDigits = digitCount == name.Length;

            // Exactly one character is not a concrete number and the name contains a dot.
            // NOTE(review): this also accepts a lone "." (and an empty name counts as
            // "all digits" above) - confirm that is intended.
            bool digitsWithDot = !allDigits
                                 && digitCount == name.Length - 1
                                 && name.Contains(".");

            if (allDigits || digitsWithDot)
            {
                w.MaxType = WordType.NumberConcrete;
                return;
            }

            // Anything else: letters, mixed text, math expressions, addresses, ...
            w.MaxType = WordType.NounAlphaNumberMark;
        }
Ejemplo n.º 9
0
 /// <summary>
 /// Creates the info object and stores the string type that
 /// <c>Regex_Helper.Get_String_Type</c> reports for <paramref name="str"/>.
 /// </summary>
 /// <param name="str">String to classify.</param>
 public String_Info(string str)
 {
     var detectedType = Regex_Helper.Get_String_Type(str);

     this.Type = detectedType;
 }
Ejemplo n.º 10
0
 /// <summary>
 /// Tells whether <paramref name="ch"/> counts as a stop word.
 /// </summary>
 /// <param name="ch">Character to test.</param>
 /// <returns>
 /// <c>true</c> when <c>IsFirstSingleWord</c> accepts the character or, failing that,
 /// when <c>Regex_Helper.Is_AMN</c> accepts it; otherwise <c>false</c>.
 /// </returns>
 public static bool IsStopWord(this char ch)
 {
     if (IsFirstSingleWord(ch))
     {
         return true;
     }

     // Is_AMN is only consulted when the first test fails.
     return Regex_Helper.Is_AMN(ch.ToString());
 }
Ejemplo n.º 11
0
        /// <summary>
        /// Segments <paramref name="temp"/> into words. Letter/digit runs and runs of
        /// unknown characters are cut out directly; every other character is looked up
        /// character by character in <c>_provider.NegtiveDic</c>.
        /// </summary>
        /// <param name="temp">Text to segment.</param>
        /// <returns>
        /// The list of words collected during the scan, after it has been passed through
        /// <c>afterReflex</c> and <c>reverseList</c>.
        /// </returns>
        public List <_WordInnfo> Reflex(string temp)
        {
            var ls = new List <_WordInnfo>();

            /*******************
             * Get the problematic string
             * **************/
            // current_node: node reached by the dictionary lookup in progress.
            // max_node / max_time: last node seen whose Is_Empty was false, and the
            //   search count recorded together with it.
            // searchTime: incremented once for every character fed to the lookup.
            // isSearchFromTheTree: true when the next character must be looked up with
            //   Get_Node (start of a lookup) instead of Get_Child.
            // NOTE(review): searchTime, max_time, isSearchFromTheTree, max_node and i are
            // handed to endReflex by ref, so it presumably resets them - confirm there.
            Search_Tree_Node <_WordInnfo> current_node = null;
            Search_Tree_Node <_WordInnfo> max_node     = null;
            int  searchTime          = 0;
            int  max_time            = 0;
            bool isSearchFromTheTree = true;

            for (int i = 0; i < temp.Length; i++)
            {
                /****************************
                 * Handle digits, letters and unknown characters
                 * (only while no dictionary lookup is in progress)
                 * ****************************/
                if (searchTime == 0)
                {
                    /******************** letters, digits *************************/
                    if (Regex_Helper.Is_Math_Expression(temp[i].ToString()))
                    {
                        // The helper appends the word to ls and updates i itself.
                        reflexNumberAlpha(ls, ref temp, ref i);
                        continue;
                    }
                    /*********************** unknown characters ********************************/
                    if (!Regex_Helper.Is_ACN(temp[i].ToString()) && !Regex_Helper.Is_Mark(temp[i].ToString()))
                    {
                        // The helper appends the word to ls and updates i itself.
                        reflexUnknowChars(ls, ref temp, ref i);
                        continue;
                    }
                }

                /******************
                 * Increase the search count
                 * *********************/
                searchTime++;
                if (isSearchFromTheTree)
                {
                    // Start of a lookup: fetch the node for this character from the dictionary.
                    current_node = _provider.NegtiveDic.Get_Node(temp[i].ToString());

                    isSearchFromTheTree = false;

                    /***************
                     * Should segmentation of the current word end?
                     * ********************/
                    if (current_node == null)
                    {
                        endReflex(ref temp, ls, ref i, ref searchTime, ref max_time, ref isSearchFromTheTree, current_node, ref max_node);
                    }
                    else

                    /*************
                     * Should max_node be updated?
                     * ********************/
                    if (!current_node.Is_Empty)
                    {
                        max_node = current_node;
                        max_time = searchTime;
                    }
                }
                else
                {
                    // Continue the lookup in progress with the next character.
                    // NOTE(review): current_node is passed to endReflex by value, so it is
                    // still null here if endReflex did not set isSearchFromTheTree back to
                    // true - confirm that it always does.
                    current_node = current_node.Get_Child(temp[i]);

                    /***************
                     * Should segmentation of the current word end?
                     * ********************/
                    if (current_node == null)
                    {
                        endReflex(ref temp, ls, ref i, ref searchTime, ref max_time, ref isSearchFromTheTree, current_node, ref max_node);
                    }

                    else

                    /*************
                     * Should max_node be updated?
                     * ********************/
                    if (!current_node.Is_Empty)
                    {
                        max_node = current_node;
                        // NOTE(review): this branch records searchTime - 1 while the branch
                        // above records searchTime - confirm the difference is intended.
                        max_time = searchTime - 1;
                    }
                }
            }

            /******************
             * Processing after the scan has finished
             * **********************/

            afterReflex(ls, current_node, max_node, temp);

            /**************** reverse the list ******************/
            reverseList(ls);
            return(ls);
        }