Ejemplo n.º 1
0
        /// <summary>
        /// Collects a run of "unknown" characters, starting with <paramref name="ch"/>,
        /// into a single word and appends it to the current sentence.
        /// A character belongs to the run when both <c>Regex_Helper.Is_ACN</c> and
        /// <c>Regex_Helper.Is_Mark</c> reject it (presumably text in other scripts,
        /// e.g. Japanese or Arabic, going by the original comment).
        /// </summary>
        /// <param name="ch">First character of the run.</param>
        private void flexUnknowChar(char ch)
        {
            var name = ch.ToString();

            // Set only when the scan read a character that does NOT belong to the
            // run. If the loop ends because the input is exhausted, every character
            // read was appended to the word and nothing has to be handed back.
            bool overshot = false;

            while (hasNext)
            {
                var b    = next();
                var text = b.ToString();

                if (Regex_Helper.Is_ACN(text) || Regex_Helper.Is_Mark(text))
                {
                    overshot = true;
                    break;
                }
                name += b;
            }

            // Return the terminating character to the scanner so the caller can
            // process it. The previous version stepped back after ANY iteration,
            // which re-exposed the last appended character when the input ran out.
            // NOTE(review): assumes previous() undoes exactly one next() - confirm.
            if (overshot)
            {
                previous();
            }

            var w = new _WordInnfo();
            w.Name    = name;
            w.MaxType = WordType.Noun;
            _susentence.Words.Add(w);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Collects a run of alphanumeric characters and non-Chinese punctuation,
        /// starting with <paramref name="ch"/>, into a single word (the original
        /// comments mention web addresses and math expressions as examples), lets
        /// <c>SetAlphaNumberType</c> classify it, and appends it to the current sentence.
        /// </summary>
        /// <param name="ch">First character of the run.</param>
        private void flexNumberAlpha(char ch)
        {
            var name = ch.ToString();

            // Set only when the scan read a character that does NOT belong to the
            // run. If the loop ends because the end of the context was reached,
            // every character read was appended and nothing has to be handed back.
            bool overshot = false;

            while (_currentPos < _context.Length - 1)
            {
                var b    = next();
                var text = b.ToString();

                // A character continues the run when it is alphanumeric, or when it
                // is a punctuation mark other than a Chinese one. Evaluation order
                // (Is_AN, then Is_Mark, then Is_Chinese_Mark) matches the original.
                bool belongs = Regex_Helper.Is_AN(text)
                               || (Regex_Helper.Is_Mark(text) && !Regex_Helper.Is_Chinese_Mark(text));
                if (!belongs)
                {
                    overshot = true;
                    break;
                }
                name += b;
            }

            var w = new _WordInnfo()
            {
                Name = name,
            };

            // Decide the part of speech of the fragment: it may be a numeral or
            // something else (math expression, web address, ...).
            SetAlphaNumberType(w);

            // Return the terminating character to the scanner so the caller can
            // process it. The previous version stepped back after ANY iteration,
            // which re-exposed the last appended character when the input ran out.
            // NOTE(review): assumes previous() undoes exactly one next() - confirm.
            if (overshot)
            {
                previous();
            }

            // NOTE(review): redundant unless SetAlphaNumberType rewrites Name;
            // kept so the stored name is always the scanned text, as before.
            w.Name = name;
            _susentence.Words.Add(w);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Hands the current token to a specialised scanner when it starts a
        /// fragment that is scanned character by character here.
        /// </summary>
        /// <returns>
        /// <c>true</c> when <c>flexNumberAlpha</c> or <c>flexUnknowChar</c> was invoked
        /// for the token; <c>false</c> when neither applied.
        /// </returns>
        private bool workSpecialChars()
        {
            var tokenText = _token.ToString();

            // Fragments that begin with a letter (Greek letters included, per the
            // original comment), a digit or an English punctuation mark:
            // e-mail, addresses, math expressions and so on.
            bool startsAlphaNumericRun = Regex_Helper.Is_AN(tokenText)
                                         || Regex_Helper.Is_Concrete_Number(tokenText)
                                         || Regex_Helper.Is_English_Mark(tokenText);
            if (startsAlphaNumericRun)
            {
                flexNumberAlpha(_token);
                return true;
            }

            // Tokens accepted by Is_ACN or Is_Mark are not handled here.
            bool isKnownCategory = Regex_Helper.Is_ACN(tokenText) || Regex_Helper.Is_Mark(tokenText);
            if (isKnownCategory)
            {
                return false;
            }

            // Anything else starts a run of unknown characters
            // (other scripts such as Japanese or Korean, per the original comment).
            flexUnknowChar(_token);
            return true;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Helper of the reverse segmentation: reads the run of unknown characters
        /// that starts at <c>temp[i]</c>, appends it to <paramref name="ls"/> as one
        /// noun, and leaves <paramref name="i"/> on the last character of the run.
        /// A character belongs to the run when both <c>Regex_Helper.Is_ACN</c> and
        /// <c>Regex_Helper.Is_Mark</c> reject it.
        /// </summary>
        /// <param name="ls">Word list that receives the result.</param>
        /// <param name="temp">Character sequence being segmented (not modified here).</param>
        /// <param name="i">
        /// On entry, index of the first character of the run. On exit, index of the
        /// last character of the run, so that the caller's loop increment moves to
        /// the first character after it.
        /// </param>
        private void reflexUnknowChars(List <_WordInnfo> ls, ref string temp, ref int i)
        {
            var temps = temp[i].ToString();

            // j stops on the first character that does not belong to the run,
            // or on temp.Length when the run reaches the end of the string.
            int j = i + 1;
            for (; j < temp.Length; j++)
            {
                var text = temp[j].ToString();
                if (Regex_Helper.Is_ACN(text) || Regex_Helper.Is_Mark(text))
                {
                    break;
                }
                temps += temp[j];
            }

            var _w = new _WordInnfo();
            _w.Name    = temps;
            _w.MaxType = WordType.Noun;
            ls.Add(_w);

            // Always park i on the last character of the run. The previous version
            // used "i != temp.Length - 1" to decide whether to step back, which
            // cannot tell "run reached the end" apart from "terminator is the last
            // character"; in the latter case the terminator was skipped entirely.
            i = j - 1;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Segments <paramref name="temp"/> into words. Letter/digit fragments and
        /// unknown-character runs are delegated to helper scanners; every other
        /// character is looked up step by step in <c>_provider.NegtiveDic</c>.
        /// The collected list is passed through <c>afterReflex</c> and
        /// <c>reverseList</c> before it is returned.
        /// </summary>
        /// <param name="temp">Character sequence to segment.</param>
        /// <returns>The word list produced for <paramref name="temp"/>.</returns>
        public List <_WordInnfo> Reflex(string temp)
        {
            var ls = new List <_WordInnfo>();

            /*******************
             * State of the dictionary walk:
             *   current_node        - node returned by the most recent lookup
             *   max_node            - most recent node whose Is_Empty was false
             *   searchTime          - incremented once per character sent to the dictionary
             *   max_time            - value recorded together with max_node
             *   isSearchFromTheTree - true: next lookup starts from the dictionary itself,
             *                         false: next lookup continues from current_node
             * **************/
            Search_Tree_Node <_WordInnfo> current_node = null;
            Search_Tree_Node <_WordInnfo> max_node     = null;
            int  searchTime          = 0;
            int  max_time            = 0;
            bool isSearchFromTheTree = true;

            for (int i = 0; i < temp.Length; i++)
            {
                /****************************
                 * Digits, letters and unknown characters are only
                 * considered when no dictionary walk is in progress.
                 * ****************************/
                if (searchTime == 0)
                {
                    /******************** letters, digits *************************/
                    if (Regex_Helper.Is_Math_Expression(temp[i].ToString()))
                    {
                        reflexNumberAlpha(ls, ref temp, ref i);
                        continue;
                    }
                    /*********************** unknown characters ********************************/
                    if (!Regex_Helper.Is_ACN(temp[i].ToString()) && !Regex_Helper.Is_Mark(temp[i].ToString()))
                    {
                        reflexUnknowChars(ls, ref temp, ref i);
                        continue;
                    }
                }

                /******************
                 * Count this character as one more lookup step.
                 * *********************/
                searchTime++;
                if (isSearchFromTheTree)
                {
                    current_node = _provider.NegtiveDic.Get_Node(temp[i].ToString());

                    isSearchFromTheTree = false;

                    /***************
                     * No node for this character: let endReflex close the current word.
                     * NOTE(review): endReflex receives i, searchTime, max_time,
                     * isSearchFromTheTree and max_node by ref; what it does with
                     * them is not visible here.
                     * ********************/
                    if (current_node == null)
                    {
                        endReflex(ref temp, ls, ref i, ref searchTime, ref max_time, ref isSearchFromTheTree, current_node, ref max_node);
                    }
                    else

                    /*************
                     * Remember this node as max_node when it is not empty
                     * (presumably a non-empty node marks a complete word - confirm).
                     * ********************/
                    if (!current_node.Is_Empty)
                    {
                        max_node = current_node;
                        max_time = searchTime;
                    }
                }
                else
                {
                    // NOTE(review): current_node is dereferenced here, so after a failed
                    // lookup endReflex is presumably expected to set isSearchFromTheTree
                    // back to true - confirm.
                    current_node = current_node.Get_Child(temp[i]);

                    /***************
                     * No child for this character: let endReflex close the current word.
                     * ********************/
                    if (current_node == null)
                    {
                        endReflex(ref temp, ls, ref i, ref searchTime, ref max_time, ref isSearchFromTheTree, current_node, ref max_node);
                    }

                    else

                    /*************
                     * Remember this node as max_node when it is not empty.
                     * NOTE(review): max_time is searchTime - 1 here but searchTime in the
                     * branch above - confirm the asymmetry is intended.
                     * ********************/
                    if (!current_node.Is_Empty)
                    {
                        max_node = current_node;
                        max_time = searchTime - 1;
                    }
                }
            }

            /******************
             * End-of-scan handling.
             * **********************/

            afterReflex(ls, current_node, max_node, temp);

            /**************** reverse the list ******************/
            reverseList(ls);
            return(ls);
        }