示例#1
0
        /// <summary>
        /// 判断是否应该粗略结合当前字符到当前roughchunk中
        /// 对地址判断不准确,因为地址值包含中文,数字,符号,字母多种组合
        /// </summary>
        /// <param name="chunk_type">当前roughchunk的字符类型</param>
        /// <param name="cur_type">当前字符的类型</param>
        /// <param name="i">当前字符在原始文本中的位置</param>
        /// <param name="input">原始文本</param>
        /// <returns>小于0:不结合;大于0:结合后的块字符类型</returns>
        private static int ShouldCombineRoughly(int chunk_type, int cur_type, int i, string input)
        {
            var and = chunk_type & cur_type;

            if (and > 0)
            {
                return(chunk_type);
            }

            if (chunk_type == CharUtil.Space_T || cur_type == CharUtil.Space_T)             // 首次遇到空格,进行切分
            {
                return(-1);
            }

            if ((chunk_type & (CharUtil.Number_T | CharUtil.Alphbet_T)) > 0)        // 字母数字符号混合,但必须以字母或数字开头
            {
                if (cur_type == CharUtil.Alphbet_T || cur_type == CharUtil.Number_T || cur_type == CharUtil.Punctuation_T)
                {
                    return(chunk_type | cur_type);
                }
            }

            if (cur_type == CharUtil.EmbedPunctuation_T)     // 当前字符是可嵌入符号
            {
                var curr_c = CharUtil.ToHalfAngle(input[i]);
                if (curr_c == ')' || curr_c == ']')        // 当前符号是关括号,则合并
                {
                    return(chunk_type | CharUtil.EmbedPunctuation_T);
                }

                if (i < input.Length - 1)
                {
                    var next_type = CharUtil.GetType(CharUtil.ToHalfAngle(input[i + 1]));
                    and = chunk_type & next_type;                   // 前后符号一致,则合并
                    if (and > 0)
                    {
                        return(chunk_type | CharUtil.EmbedPunctuation_T);
                    }
                }
            }
            else if (chunk_type == CharUtil.EmbedPunctuation_T)      // 当前块是可嵌入符号类型
            {
                var prev_c = CharUtil.ToHalfAngle(input[i - 1]);
                if (prev_c == '(' || prev_c == '[')                 // 上一个符号是开括号
                {
                    return(cur_type | CharUtil.EmbedPunctuation_T); // 合并
                }
            }

            return(-1);      // 不合并
        }
示例#2
0
        public static List <RoughChunk> RoughSplit(string input)
        {
            if (string.IsNullOrWhiteSpace(input))
            {
                return(null);
            }

            var list = new List <RoughChunk>();
            var sb   = new StringBuilder(input.Length); // 缓存当前roughchunk的文本内容
            int i    = 0;                               // input当前处理字符位置


            var c          = CharUtil.ToHalfAngle(input[0]);
            var cur_type   = CharUtil.GetType(c);
            int chunk_type = cur_type;          // 当前块字符类型

            while (i < input.Length)
            {
                var new_type = ShouldCombineRoughly(chunk_type, cur_type, i, input);
                while (new_type > 0)
                {
                    sb.Append(c);
                    chunk_type = new_type;
                    i++;
                    if (i == input.Length)
                    {
                        break;
                    }
                    c        = CharUtil.ToHalfAngle(input[i]);
                    cur_type = CharUtil.GetType(c);
                    new_type = ShouldCombineRoughly(chunk_type, cur_type, i, input);
                }

                if (i == input.Length)   // 结束
                {
                    list.Add(new RoughChunk(sb.ToString(), i - sb.Length, chunk_type));
                    return(list);
                }
                else
                {
                    list.Add(new RoughChunk(sb.ToString(), i - sb.Length, chunk_type));
                    sb.Clear();
                }
                chunk_type = cur_type;
            }
            return(list);
        }