Пример #1
0
        /// <summary>
        /// Reads the run of digits that starts at <paramref name="startIndex"/> in the context
        /// text and reports it as a single <c>POSType.A_M</c> result when its length fits the
        /// active pattern: exactly 6 digits for <c>ParserPattern.China</c>, exactly 5 for
        /// <c>ParserPattern.NorthAmerica</c>. Other patterns impose no length check.
        /// </summary>
        /// <param name="startIndex">Index in <c>context.Text</c> at which scanning begins.</param>
        /// <returns>
        /// A collection holding one result, or an empty collection when the digit run has the
        /// wrong length for the pattern or <paramref name="startIndex"/> lies at or past the end
        /// of the text.
        /// </returns>
        public ParseResultCollection Parse(int startIndex)
        {
            string                _text   = context.Text;
            ParserPattern         _format = context.Pattern;
            ParseResultCollection prc     = new ParseResultCollection();

            // Nothing to read at or beyond the end of the text.
            if (startIndex >= _text.Length)
            {
                return(prc);
            }

            StringBuilder sb = new StringBuilder(6);

            // The bounds check is the loop condition itself, so a digit run that reaches the
            // end of the text terminates cleanly instead of indexing one past the last char.
            for (int i = startIndex; i < _text.Length; i++)
            {
                char ch = _text[i];
                if (!NumeralUtil.IsArabicNumeral(ch) && !(ch >= '0' && ch <= '9'))
                {
                    break;
                }
                sb.Append(ch);
            }
            string source = sb.ToString();

            if (_format == ParserPattern.China)
            {
                if (source.Length != 6)
                {
                    return(prc);
                }
            }
            else if (_format == ParserPattern.NorthAmerica)
            {
                if (source.Length != 5)
                {
                    return(prc);
                }
            }
            prc.Add(ParseResult.Create(source, startIndex, POSType.A_M));

            return(prc);
        }
Пример #2
0
        /// <summary>
        /// Recognises a number immediately followed by a Chinese quantity word. The number is a
        /// run of Arabic numerals, Chinese numeral characters and '.' beginning at
        /// <paramref name="startIndex"/>.
        /// </summary>
        /// <param name="startIndex">Index in <c>context.Text</c> where the number should begin.</param>
        /// <returns>
        /// Two results (the number as <c>POSType.A_M</c>, the quantity character as
        /// <c>POSType.A_Q</c>) when the pattern matches; otherwise an empty collection.
        /// </returns>
        public ParseResultCollection Parse(int startIndex)
        {
            ParseResultCollection results = new ParseResultCollection();
            string input   = context.Text;
            int    pos     = startIndex;
            char   current = input[pos];

            // Advance while the current character belongs to the number and a following
            // character still exists to step onto.
            while (pos + 1 < input.Length)
            {
                bool partOfNumber = NumeralUtil.IsArabicNumeral(current) ||
                                    NumeralUtil.IsChineseNumeralChars(current) ||
                                    current == '.';
                if (!partOfNumber)
                {
                    break;
                }

                // A carry/unit character in first position (anything but '十') cannot start a number.
                bool isFirstChar = pos == startIndex;
                if (isFirstChar && NumeralUtil.IsChineseGenDigit(current) && current != '十')
                {
                    return(results);
                }

                pos++;
                current = input[pos];
            }

            // No numeric character was consumed.
            if (pos == startIndex)
            {
                return(results);
            }

            // pos never leaves the text (the loop only advances when a next character exists),
            // so it can be used directly as the index of the candidate quantity character.
            if (!IsChineseQuantity(input[pos]))
            {
                return(results);
            }

            string number = input.Substring(startIndex, pos - startIndex);
            results.Add(ParseResult.Create(number, startIndex, POSType.A_M));
            results.Add(ParseResult.Create(input[pos].ToString(), pos, POSType.A_Q));
            return(results);
        }
Пример #3
0
        /// <summary>
        /// Verifies that <c>NumeralUtil.IsEnglishNumeral</c> accepts each English number word in
        /// the sample list ("one" through "ten" plus the magnitude words).
        /// </summary>
        public void TestIsEnglishNumeral()
        {
            // "seven" was previously misspelled as "steven", which is not a number word.
            string[] enNumText = new string[] { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
                                                "billion", "million", "thousand" };

            foreach (string str in enNumText)
            {
                // The word is passed as the failure message so a failing entry is identifiable.
                Assert.IsTrue(NumeralUtil.IsEnglishNumeral(str), str);
            }
        }
Пример #4
0
        /// <summary>
        /// Verifies that <c>NumeralUtil.IsChineseNumeral</c> accepts the everyday Chinese digits
        /// and the financial (RMB) digit forms, and rejects the character '7'.
        /// </summary>
        public void TestIsChineseNumeral()
        {
            char[] chnGenText = new char[] { '零', '一', '二', '三', '四', '五', '六', '七', '八', '九' };

            foreach (char ch in chnGenText)
            {
                // The character is passed as the failure message so a failing entry is identifiable.
                Assert.IsTrue(NumeralUtil.IsChineseNumeral(ch), ch.ToString());
            }

            // Financial numeral forms. The seventh entry is '柒'; the list previously held the
            // look-alike '染' ("to dye"), which is not a numeral.
            char[] chnRMBText = new char[] { '零', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖' };
            foreach (char ch in chnRMBText)
            {
                Assert.IsTrue(NumeralUtil.IsChineseNumeral(ch), ch.ToString());
            }
            Assert.IsFalse(NumeralUtil.IsChineseNumeral('7'));
        }
Пример #5
0
        /// <summary>
        /// Verifies that <c>NumeralUtil.ConvertChineseNumeral2Arabic</c> rewrites the Chinese
        /// numerals embedded in a sentence as Arabic digits and leaves the other text unchanged.
        /// </summary>
        public void TestConvertChineseNumeral2Arabic()
        {
            // Arguments are given as (expected, actual) so a failure report labels the two
            // values correctly.
            Assert.AreEqual(
                "你好,这里有235块钱,收好了,总共2252块。2减5等于-30050。",
                NumeralUtil.ConvertChineseNumeral2Arabic("你好,这里有二百三十五块钱,收好了,总共二千二百五十二块。二减五等于负三万零五十。")
                );

            Assert.AreEqual(
                "公元2005年4月",
                NumeralUtil.ConvertChineseNumeral2Arabic("公元二零零五年四月")
                );
            Assert.AreEqual(
                "公元前455年",
                NumeralUtil.ConvertChineseNumeral2Arabic("公元前四五五年")
                );
        }
Пример #6
0
        /// <summary>
        /// Scans at most <c>maxChineseAddressLength</c> characters of the context text, starting
        /// at <paramref name="startIndex"/>, and splits them into the parts of a Chinese address
        /// (country, province, city, district, county, street, lane, number, floor, room,
        /// building, extra) by reacting to marker characters such as '市', '区', '省' or '路'.
        /// </summary>
        /// <param name="startIndex">Index in <c>context.Text</c> where the address should begin.</param>
        /// <returns>
        /// A collection with one <c>POSType.D_S</c> result carrying the recognised text and the
        /// filled <c>ChineseAddress</c>, or an empty collection when no part was recognised.
        /// </returns>
        public ParseResultCollection Parse(int startIndex)
        {
            string _text = context.Text;
            ParseResultCollection prc = new ParseResultCollection();
            string temp = _text.Substring(startIndex, Math.Min(maxChineseAddressLength, _text.Length - startIndex));

            char[] chars = temp.ToCharArray();
            //int lastStartPos = 0;
            // sb holds the characters of the segment currently being read; whole accumulates
            // every segment that has been recognised so far.
            StringBuilder  sb       = new StringBuilder();
            StringBuilder  whole    = new StringBuilder();
            ChineseAddress ca       = new ChineseAddress();
            int            startpos = 0;

            //TODO: look up the country name through a dictionary
            if (temp.StartsWith("中国"))
            {
                startpos   = 2;
                ca.country = "中国";
                whole.Append("中国");
            }
            // In most branches below, a marker character met while sb is empty is treated as
            // ordinary text: it simply becomes the first character of a new segment.
            for (int i = startpos; i < chars.Length; i++)
            {
                char ch = chars[i];
                if (ch == '市' || ch == '场')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    sb.Append(ch);
                    string subStr = sb.ToString();
                    // GetMaximumMatch is defined elsewhere; presumably it returns the longest
                    // known place name found in subStr, or null when there is none - TODO confirm.
                    string city   = GetMaximumMatch(subStr, 0, 5);
                    if (city != null)
                    {
                        ca.city = city;
                        whole.Append(ca.city);
                        sb = new StringBuilder();
                    }
                    // When nothing matches, the segment is kept and keeps growing.
                }
                else if (ch == '区')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    sb.Append(ch);
                    string subStr = sb.ToString();

                    string district = GetMaximumMatch(subStr, 0, 5);
                    if (district != null)
                    {
                        if (!district.EndsWith("区"))
                        {
                            // The match does not end in '区', so it is taken as the city and the
                            // remainder of the segment becomes the district.
                            ca.city = district;
                            whole.Append(ca.city);
                            ca.district = subStr.Substring(ca.city.Length);
                            whole.Append(ca.district);
                        }
                        else
                        {
                            //string district = NEParser.GetMaximumMatch(subStr, 0, 5, "district", _cityNames, null);
                            ca.district = district;
                            whole.Append(ca.district);
                        }
                    }
                    else
                    {
                        // No match: the whole segment is used as the district.
                        ca.district = subStr;
                        whole.Append(ca.district);
                    }
                    sb = new StringBuilder();
                }
                else if (ch == '省')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    sb.Append(ch);
                    string subStr   = sb.ToString();
                    string province = GetMaximumMatch(subStr, 0, 5);    // province
                    if (province != null)
                    {
                        ca.province = province;
                        whole.Append(ca.province);
                        sb = new StringBuilder();
                    }
                }
                else if (ch == '乡' || ch == '村' || ch == '县' || ch == '镇')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // The segment is accepted as the county without any lookup.
                    sb.Append(ch);
                    ca.county = sb.ToString();
                    whole.Append(ca.county);
                    sb = new StringBuilder();
                }
                else if (ch == '巷')
                {
                    // NOTE(review): empty branch - '巷' is added to neither sb nor whole, so the
                    // character is dropped from the result. Confirm this is intended.
                }
                else if (ch == '楼' || ch == '弄' || ch == '号' || ch == '室')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // The text in front of the marker must be an integer once Chinese numerals
                    // have been converted; otherwise the segment just keeps growing.
                    string substr = NumeralUtil.ConvertChineseNumeral2Arabic(sb.ToString());
                    int    x;
                    sb.Append(ch);
                    if (Int32.TryParse(substr, out x))
                    {
                        if (ch == '楼')
                        {
                            ca.floor = sb.ToString();
                        }
                        else if (ch == '弄')
                        {
                            ca.lane = sb.ToString();
                        }
                        else if (ch == '号')
                        {
                            ca.no = sb.ToString();
                        }
                        else if (ch == '室')
                        {
                            ca.room = sb.ToString();
                        }
                        whole.Append(sb.ToString());
                        sb = new StringBuilder();
                    }
                }
                else if (ch == '道' || ch == '路' || ch == '街')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // The segment is accepted as the street without any lookup.
                    sb.Append(ch);
                    ca.street = sb.ToString();
                    whole.Append(ca.street);
                    sb = new StringBuilder();
                }
                // NOTE(review): both operands of this comparison look identical here; presumably
                // one of them was a full-width parenthesis originally - confirm against upstream.
                else if (ch == '(' || ch == '(')
                {
                    // An opening parenthesis discards the pending segment and starts a new one.
                    sb = new StringBuilder();
                    sb.Append(ch);
                }
                else if (ch == ')' || ch == ')')
                {
                    // The parenthesised text is stored as the extra part.
                    sb.Append(ch);
                    string extra1 = sb.ToString();
                    whole.Append(extra1);
                    ca.extra = extra1;
                    sb       = new StringBuilder();
                }
                // NOTE(review): the two space comparisons look identical here; presumably one
                // was a full-width space originally - confirm against upstream.
                else if (CharacterUtil.IsChinesePunctuation(ch) || (ch == ' ' || ch == ' '))
                {
                    // Punctuation or a space ends the address.
                    break;
                }
                else if (ch == '大')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // NOTE(review): when none of the look-ahead cases below applies, '大' is not
                    // appended to sb and is therefore dropped from the segment - confirm intent.
                    if (i + 1 < chars.Length)
                    {
                        char nextchar = chars[i + 1];

                        if (nextchar == '桥' || nextchar == '厦')
                        {
                            // "...大桥" is added to extra, "...大厦" becomes the building.
                            string extra1 = sb.ToString() + "大" + nextchar;
                            whole.Append(extra1);
                            if (nextchar == '桥')
                            {
                                ca.extra += extra1;
                            }
                            else
                            {
                                ca.building = extra1;
                            }
                            // Skip the look-ahead character; the loop's own i++ accounts for '大'.
                            i += 2 - 1;
                            sb = new StringBuilder();
                        }
                        else if (i + 2 < chars.Length && nextchar == '酒')
                        {
                            char nextchar2 = chars[i + 2];

                            if (nextchar2 == '店')
                            {
                                // "...大酒店": a leading place-name match, if any, is split off
                                // as the city and the remainder becomes the building.
                                string extra1 = sb.ToString() + "大" + nextchar + nextchar2;
                                string city   = GetMaximumMatch(extra1, 0, 5);  // city or province
                                if (city != null)
                                {
                                    ca.city = city;
                                    whole.Append(ca.city);

                                    extra1 = extra1.Substring(ca.city.Length);
                                }
                                whole.Append(extra1);
                                ca.building = extra1;
                                // Skip both look-ahead characters.
                                i          += 3 - 1;
                                sb          = new StringBuilder();
                            }
                        }
                    }
                }
                else if (ch == '餐')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // NOTE(review): when the next character is not '厅', '餐' is not appended to
                    // sb and is therefore dropped from the segment - confirm intent.
                    if (i + 1 < chars.Length)
                    {
                        char nextchar = chars[i + 1];
                        if (nextchar == '厅')
                        {
                            // "...餐厅" is added to extra.
                            string extra1 = sb.ToString() + "餐" + nextchar;
                            whole.Append(extra1);
                            ca.extra += extra1;
                            i        += 2 - 1;
                            sb        = new StringBuilder();
                        }
                    }
                }
                else
                {
                    // Ordinary character: extend the segment, then check for a building suffix.
                    //if (sb.Length == 0)
                    //    lastStartPos = i;
                    sb.Append(ch);
                    string extra = sb.ToString();
                    if (extra.EndsWith("中心") || extra.EndsWith("酒店"))
                    {
                        string city = GetMaximumMatch(extra, 0, 5); // city
                        if (city != null)
                        {
                            // NOTE(review): unlike the "大酒店" case above, the city is not
                            // appended to whole here; only the remainder is - confirm intent.
                            ca.city = city;
                            extra   = extra.Substring(city.Length);
                        }
                        ca.building = extra;
                        whole.Append(extra);
                        if (i + 2 < chars.Length && chars[i + 1] == '大' && chars[i + 2] == '厦')  // handle "中心大厦"
                        {
                            ca.building += "大厦";
                            whole.Append("大厦");
                            i += 2;
                            sb = new StringBuilder();
                            continue;
                        }
                        sb = new StringBuilder();
                    }
                }
            }
            // A result is produced only when at least one part was recognised.
            if (whole.Length > 0)
            {
                if (sb.Length > 0)
                {
                    // Text left in the pending segment replaces any extra collected earlier;
                    // it is not added to whole.
                    ca.extra = sb.ToString();
                }
                prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.D_S, ca));
            }
            return(prc);
        }
Пример #7
0
 /// <summary>
 /// Tells whether <paramref name="ch"/> may appear inside a phone number written in the
 /// given <paramref name="pattern"/>.
 /// </summary>
 /// <param name="ch">Character to test.</param>
 /// <param name="pattern">Phone number convention in effect.</param>
 /// <returns>
 /// <c>true</c> for digits and the punctuation accepted by the pattern; <c>false</c> for
 /// every other character and for any pattern other than China or North America.
 /// </returns>
 bool IsAllowedChar(char ch, ParserPattern pattern)
 {
     if (pattern == ParserPattern.China)
     {
         switch (ch)
         {
             // Separators, parentheses, country-code prefix, extension mark and space.
             case '-':
             case '—':
             case '(':
             case ')':
             case '+':
             case '#':
             case ' ':
                 return(true);

             default:
                 return((ch >= '0' && ch <= '9') || NumeralUtil.IsArabicNumeral(ch));
         }
     }

     if (pattern == ParserPattern.NorthAmerica)
     {
         switch (ch)
         {
             // Separators, parentheses, country-code prefix, space and the letters of "ext.".
             case '-':
             case '(':
             case ')':
             case '+':
             case ' ':
             case 'e':
             case 'x':
             case 't':
             case '.':
                 return(true);

             default:
                 return(NumeralUtil.IsArabicNumeral(ch));
         }
     }

     return(false);
 }
Пример #8
0
        /// <summary>
        /// Recognises a phone number that begins at <paramref name="startIndex"/> in the context
        /// text. The method first collects the longest run of characters allowed by the active
        /// pattern, then walks that run to split it into country code, area code, main number
        /// and extension.
        /// </summary>
        /// <param name="startIndex">Index in <c>context.Text</c> where the number should begin.</param>
        /// <returns>
        /// A collection with one <c>POSType.A_M</c> result carrying the matched text and the
        /// filled <c>PhoneNo</c>, or an empty collection when no main number was found or
        /// <paramref name="startIndex"/> lies at or past the end of the text.
        /// </returns>
        /// <exception cref="NotImplementedException">
        /// The active pattern is neither China nor North America and the collected run is long
        /// enough to be parsed.
        /// </exception>
        public ParseResultCollection Parse(int startIndex)
        {
            string                _text    = context.Text;
            ParserPattern         _pattern = context.Pattern;
            int                   k        = startIndex;
            char                  ch;
            StringBuilder         sb  = new StringBuilder(10);
            ParseResultCollection prc = new ParseResultCollection();

            // Nothing to read at or beyond the end of the text.
            if (startIndex >= _text.Length)
            {
                return(prc);
            }

            // A phone number never starts with a space.
            // NOTE(review): the two comparisons look identical here; presumably one was a
            // full-width space originally - confirm against upstream.
            if (_text[startIndex] == ' ' || _text[startIndex] == ' ')
            {
                return(prc);
            }

            // Position, relative to startIndex, of a '(' that has not been closed yet; -1 if none.
            int braceStartPos = -1;

            // Phase 1: collect the run of allowed characters, normalising dash variants.
            while (k < _text.Length)
            {
                ch = _text[k];
                if (!IsAllowedChar(ch, _pattern))
                {
                    break;
                }
                // NOTE(review): as written, the digit, space and parenthesis conversions below
                // map each character onto itself; presumably they converted full-width forms to
                // ASCII originally - confirm against upstream before removing them.
                if (ch >= '0' && ch <= '9')
                {
                    ch = (char)(ch - '0' + '0');
                }
                if (ch == ' ')
                {
                    ch = ' ';
                }
                else if (ch == '(')
                {
                    ch = '(';
                }
                else if (ch == ')')
                {
                    ch = ')';
                }
                else if (ch == '-' || ch == '—')
                {
                    ch = '-';
                }

                if (ch == '(')
                {
                    // Stored relative to startIndex so that it indexes into the collected run
                    // (sb / allowedString) rather than into the whole text.
                    braceStartPos = k - startIndex;
                }
                else if (ch == ')')
                {
                    braceStartPos = -1;
                }
                sb.Append(ch);
                k++;
            }
            string allowedString = sb.ToString().TrimEnd();

            // Drop an unclosed "(..." tail.
            if (braceStartPos >= 0)
            {
                allowedString = allowedString.Substring(0, braceStartPos);
            }

            // Runs shorter than 3 characters, or exactly 4 characters long, are rejected.
            if (allowedString.Length < 3 || allowedString.Length == 4)
            {
                return(prc);
            }

            bool bNumberInBrace      = false;
            bool bCountryCodeStarted = false;
            bool bAreaCodeStarted    = false;
            bool bExtStarted         = false;
            int  i = 0;

            // segment holds the digit group being read; whole holds the text that is reported.
            StringBuilder segment = new StringBuilder();
            StringBuilder whole   = new StringBuilder();

            PhoneNo phone = new PhoneNo();

            // Phase 2: split the run into its parts according to the pattern.
            if (_pattern == ParserPattern.China)
            {
                while (i < allowedString.Length)
                {
                    ch = allowedString[i];
                    if (ch == '(')
                    {
                        bNumberInBrace      = true;
                        bCountryCodeStarted = false;
                        whole.Append(ch);
                    }
                    else if (NumeralUtil.IsArabicNumeral(ch))
                    {
                        // The first digit group is taken as the area code unless a country
                        // code is being read or an area code is already known.
                        if (segment.Length == 0 && !bAreaCodeStarted &&
                            phone.AreaCode == null && !bCountryCodeStarted)
                        {
                            bAreaCodeStarted = true;
                        }

                        segment.Append(ch);
                        whole.Append(ch);
                    }
                    else if (ch == ')' && bNumberInBrace)
                    {
                        if (bCountryCodeStarted)
                        {
                            if (segment.Length > 0)
                            {
                                phone.CountryCode = segment.ToString();
                            }
                            bCountryCodeStarted = false;
                        }
                        if (bAreaCodeStarted)
                        {
                            // An area code starting with 0 has at most 4 digits; otherwise at most 3.
                            if (segment.Length > 0 && (segment[0] == '0' ? segment.Length <= 4 : segment.Length <= 3))
                            {
                                phone.AreaCode = segment.ToString();
                            }
                            bAreaCodeStarted = false;
                        }
                        whole.Append(ch);
                        segment        = new StringBuilder();
                        bNumberInBrace = false;
                    }
                    else if (ch == ' ')
                    {
                        if (bCountryCodeStarted)
                        {
                            if (segment.Length > 0)
                            {
                                phone.CountryCode = segment.ToString();
                            }
                            bCountryCodeStarted = false;
                        }
                        else if (bAreaCodeStarted)
                        {
                            if (segment.Length > 0)
                            {
                                phone.AreaCode = segment.ToString();
                            }
                            bAreaCodeStarted = false;
                        }
                        else if (segment.Length > 0)
                        {
                            AssignPhoneMain(segment, phone);
                        }
                        segment             = new StringBuilder();
                        bCountryCodeStarted = false;
                        whole.Append(ch);
                    }
                    else if (ch == '-' || ch == '#')
                    {
                        // The length is checked before segment[0] is read: a separator that
                        // directly follows ')' or another separator leaves the segment empty.
                        if (segment.Length > 0 && segment[0] == '0' && (segment.Length == 3 || segment.Length == 4))
                        {
                            phone.AreaCode = segment.ToString();
                        }
                        else if (segment.Length > 0)
                        {
                            AssignPhoneMain(segment, phone);
                            bExtStarted = true;
                        }
                        segment = new StringBuilder();
                        whole.Append(ch);
                    }
                    else if (ch == '+')
                    {
                        whole.Append(ch);
                        bCountryCodeStarted = true;
                    }
                    i++;
                }
                // Handle the digit group that was still open when the run ended.
                if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                    if (bExtStarted)
                    {
                        phone.Extension = segment.ToString();
                        bExtStarted     = false;
                    }
                }
            }
            else if (_pattern == ParserPattern.NorthAmerica)
            {
                while (i < allowedString.Length)
                {
                    ch = allowedString[i];

                    if (NumeralUtil.IsArabicNumeral(ch))
                    {
                        whole.Append(ch);
                        segment.Append(ch);
                    }
                    else if (ch == ' ')
                    {
                        whole.Append(ch);
                    }
                    else if (ch == '(')
                    {
                        bAreaCodeStarted = true;
                        whole.Append(ch);
                    }
                    else if (ch == ')')
                    {
                        if (bAreaCodeStarted)
                        {
                            if (segment.Length > 0)
                            {
                                phone.AreaCode = segment.ToString();
                            }
                            bAreaCodeStarted = false;
                        }
                        segment = new StringBuilder();
                        whole.Append(ch);
                    }
                    else if (ch == '-')
                    {
                        if (bCountryCodeStarted)
                        {
                            // The group after a country code is read as the area code.
                            if (segment.Length > 0)
                            {
                                phone.CountryCode = segment.ToString();
                            }
                            bCountryCodeStarted = false;
                            bAreaCodeStarted    = true;
                        }
                        else if (bAreaCodeStarted)
                        {
                            if (segment.Length > 0)
                            {
                                phone.AreaCode = segment.ToString();
                            }
                            bAreaCodeStarted = false;
                        }
                        else if (segment.Length > 0)
                        {
                            AssignPhoneMain(segment, phone);
                        }
                        whole.Append(ch);
                        segment = new StringBuilder();
                    }
                    else if (ch == '+')
                    {
                        bCountryCodeStarted = true;
                        whole.Append(ch);
                    }
                    else if (ch == '.')
                    {
                        // A '.' is accepted only as the end of "ext."; anything else ends parsing.
                        if (segment.ToString() != "ext")
                        {
                            break;
                        }

                        whole.Append("ext.");
                    }
                    else if (ch == 'e' || ch == 'x' || ch == 't')
                    {
                        segment.Append(ch);
                    }
                    i++;
                }
                // Handle the group that was still open when the run ended.
                if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                    if (bExtStarted)
                    {
                        phone.Extension = segment.ToString();
                        bExtStarted     = false;
                    }
                }
            }
            else
            {
                throw new NotImplementedException("Phone No. in " + _pattern.ToString() + " is not implemented in the parser.");
            }
            // Report a result only when a main number was assigned.
            if (whole.Length > 0 && phone.Main != null)
            {
                prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.A_M, phone));
            }
            return(prc);
        }
Пример #9
0
        /// <summary>
        /// Recognises a Chinese date/time expression beginning at <paramref name="startIndex"/>.
        /// Two forms are handled: relative words ("上周", "昨天", "去年", ...) and numeric
        /// expressions built from digit groups followed by a unit character
        /// (年, 月, 日, 点, 分, 秒), which are parsed through <c>DateUtil.ParseDate</c>.
        /// </summary>
        /// <param name="startIndex">
        /// Index at which scanning begins. It is applied to the text AFTER Chinese numerals
        /// have been converted to Arabic digits.
        /// </param>
        /// <returns>
        /// A collection with one <c>POSType.D_T</c> result (carrying the parsed date for the
        /// numeric form), or an empty collection when nothing was recognised.
        /// </returns>
        public ParseResultCollection Parse(int startIndex)
        {
            string text = NumeralUtil.ConvertChineseNumeral2Arabic(context.Text);
            ParseResultCollection prc = new ParseResultCollection();

            // The converted text can be shorter than context.Text (several numeral characters
            // may collapse into fewer digits), so startIndex may now lie past its end; without
            // this guard the Substring call below would receive a negative length and throw.
            // NOTE(review): startIndex is presumably an index into the unconverted text, so it
            // may point at a shifted position after conversion - confirm with the callers.
            if (startIndex >= text.Length)
            {
                return(prc);
            }

            int           boundary      = Math.Min(maxDateTimeTextLength, text.Length - startIndex);
            string        temp          = text.Substring(startIndex, boundary);
            StringBuilder sbDateText    = new StringBuilder();  // digits + unit chars handed to ParseDate
            StringBuilder sbPatternText = new StringBuilder();  // matching format pattern
            StringBuilder sbText        = new StringBuilder();  // text reported in the result
            int           strLen        = 0;                    // length of the current digit group
            char          prevCh        = ' ';
            bool          nonNumeric    = false;                // true once a relative word matched

            for (int i = 0; i < boundary; i++)
            {
                char ch = temp[i];

                if (NumeralUtil.IsArabicNumeral(ch))
                {
                    sbDateText.Append(ch);
                    sbText.Append(ch);
                    strLen++;
                }
                else if (ch == '上' || ch == '大' || ch == '前' || ch == '昨' || ch == '明' ||
                         ch == '今' || ch == '后' || ch == '去')
                {
                    // First character of a relative word: nothing is emitted yet, it is only
                    // remembered through prevCh. '上' is included so that the "上周" case below
                    // can be reached; previously '上' ended the scan before '周' was seen.
                }
                else if (ch == '周')
                {
                    if (prevCh == '上')
                    {
                        nonNumeric = true;
                        sbText.Append(prevCh);
                        sbText.Append(ch);
                        break;
                    }
                }
                else if (ch == '天')
                {
                    if (prevCh == '前' || prevCh == '昨' || prevCh == '明' || prevCh == '今' || prevCh == '后')
                    {
                        nonNumeric = true;
                        sbText.Append(prevCh);
                        sbText.Append(ch);
                        break;
                    }
                }
                else if (ch == ' ')
                {
                    // Spaces are kept in the reported text but do not update prevCh.
                    sbText.Append(ch);
                    continue;
                }
                else
                {
                    // Map a unit character to the format letter passed to GeneratePatternText.
                    char patternLetter;
                    switch (ch)
                    {
                        case '年': patternLetter = 'y'; break;
                        case '月': patternLetter = 'M'; break;
                        case '日': patternLetter = 'd'; break;
                        case '点': patternLetter = 'h'; break;
                        case '分': patternLetter = 'm'; break;
                        case '秒': patternLetter = 's'; break;
                        default:   patternLetter = '\0'; break;
                    }

                    // Any other character ends the expression.
                    if (patternLetter == '\0')
                    {
                        break;
                    }

                    // Relative year words: "去年", "前年", "今年", "后年".
                    if (ch == '年' &&
                        (prevCh == '去' || prevCh == '前' || prevCh == '今' || prevCh == '后'))
                    {
                        nonNumeric = true;
                        sbText.Append(prevCh);
                        sbText.Append(ch);
                        break;
                    }

                    // A unit character must be preceded by at least one digit.
                    if (strLen == 0)
                    {
                        return(prc);
                    }

                    sbDateText.Append(ch);
                    sbPatternText.Append(DateUtil.GeneratePatternText(patternLetter, strLen));
                    sbPatternText.Append(ch);
                    sbText.Append(ch);
                    strLen = 0;
                }
                prevCh = ch;
            }

            // Relative expressions are reported as text only, without a parsed date.
            if (sbText.Length > 0 && nonNumeric)
            {
                prc.Add(ParseResult.Create(sbText.ToString(), startIndex, POSType.D_T));
                return(prc);
            }
            if (sbDateText.Length == 0 || sbPatternText.Length == 0)
            {
                return(prc);
            }
            DateTime? dt = DateUtil.ParseDate(sbDateText.ToString(), sbPatternText.ToString());

            if (dt != null)
            {
                string result = sbText.ToString();
                prc.Add(ParseResult.Create(result, startIndex, POSType.D_T, dt));
            }
            return(prc);
        }