/// <summary>
/// Reads the run of consecutive digits that begins at <paramref name="startIndex"/> in
/// <c>context.Text</c> and reports it as a postal code candidate.
/// </summary>
/// <param name="startIndex">Index into <c>context.Text</c> where scanning starts.</param>
/// <returns>
/// A collection with a single <c>POSType.A_M</c> result holding the digit run. The collection is
/// empty when the pattern is <c>ParserPattern.China</c> and the run is not 6 characters long, or
/// when the pattern is <c>ParserPattern.NorthAmerica</c> and the run is not 5 characters long.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string text = context.Text;
    ParserPattern pattern = context.Pattern;
    ParseResultCollection prc = new ParseResultCollection();

    // The bound is checked BEFORE each character is read. The previous loop read
    // text[++i] first and only then tested the length (and, because && binds tighter
    // than ||, only for one of the two digit tests), so a digit run reaching the end
    // of the text threw an out-of-range exception.
    int end = startIndex;
    while (end < text.Length)
    {
        char ch = text[end];

        // NOTE(review): the explicit range test next to IsArabicNumeral was presumably meant
        // for full-width digits; both the ASCII and the full-width range (U+FF10..U+FF19,
        // written as escapes so they survive re-encoding of this file) are accepted.
        bool isDigit = NumeralUtil.IsArabicNumeral(ch)
                       || (ch >= '0' && ch <= '9')
                       || (ch >= '\uFF10' && ch <= '\uFF19');
        if (!isDigit)
        {
            break;
        }

        end++;
    }

    string source = text.Substring(startIndex, end - startIndex);

    if (pattern == ParserPattern.China)
    {
        if (source.Length != 6)
        {
            return prc;
        }
    }
    else if (pattern == ParserPattern.NorthAmerica)
    {
        if (source.Length != 5)
        {
            return prc;
        }
    }

    prc.Add(ParseResult.Create(source, startIndex, POSType.A_M));
    return prc;
}
/// <summary>
/// Recognises a number immediately followed by a Chinese quantity (measure) character,
/// starting at <paramref name="startIndex"/> in <c>context.Text</c>.
/// </summary>
/// <param name="startIndex">Index into <c>context.Text</c> where scanning starts.</param>
/// <returns>
/// Two results when a match is found: the number text as <c>POSType.A_M</c> followed by the
/// single quantity character as <c>POSType.A_Q</c>. Otherwise an empty collection.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();

    // Nothing to read at or past the end of the text.
    if (startIndex >= text.Length)
    {
        return prc;
    }

    int i = startIndex;
    char ch = text[i];

    // Advance over Arabic digits, Chinese numeral characters and '.'. The "i + 1" bound
    // guarantees text[++i] is always a valid read, so ch == text[i] and i < text.Length
    // hold when the loop ends.
    while (i + 1 < text.Length
           && (NumeralUtil.IsArabicNumeral(ch) || NumeralUtil.IsChineseNumeralChars(ch) || ch == '.'))
    {
        // A number may not begin with a place-value character, except '十'.
        if (i == startIndex && NumeralUtil.IsChineseGenDigit(ch) && ch != '十')
        {
            return prc;
        }

        ch = text[++i];
    }

    // No numeric character was consumed.
    if (i == startIndex)
    {
        return prc;
    }

    // The former Math.Min(i, text.Length) clamp never changed i (and would have indexed
    // one past the end if it had), so the character at i is tested directly.
    if (IsChineseQuantity(ch))
    {
        prc.Add(ParseResult.Create(text.Substring(startIndex, i - startIndex), startIndex, POSType.A_M));
        prc.Add(ParseResult.Create(ch.ToString(), i, POSType.A_Q));
    }

    return prc;
}
/// <summary>
/// Verifies that <c>NumeralUtil.IsEnglishNumeral</c> accepts each English number word in the sample list.
/// </summary>
public void TestIsEnglishNumeral()
{
    // "seven" replaces the misspelt "steven" that used to sit between "six" and "eight".
    // NOTE(review): if this now fails, NumeralUtil's word list likely carries the same typo.
    string[] enNumText =
    {
        "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
        "billion", "million", "thousand"
    };

    foreach (string word in enNumText)
    {
        // The word is passed as the failure message so the offending entry is reported.
        Assert.IsTrue(NumeralUtil.IsEnglishNumeral(word), word);
    }
}
/// <summary>
/// Verifies that <c>NumeralUtil.IsChineseNumeral</c> accepts the everyday and the financial
/// (banker's) Chinese digit characters and rejects the Arabic digit '7'.
/// </summary>
public void TestIsChineseNumeral()
{
    // Everyday digit characters 0-9.
    char[] chnGenText = { '零', '一', '二', '三', '四', '五', '六', '七', '八', '九' };
    foreach (char digit in chnGenText)
    {
        Assert.IsTrue(NumeralUtil.IsChineseNumeral(digit));
    }

    // Financial digit characters 0-9. '柒' (seven) replaces the look-alike '染' ("to dye"),
    // which is not a numeral.
    // NOTE(review): if this now fails, NumeralUtil's table likely carries the same typo.
    char[] chnRMBText = { '零', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖' };
    foreach (char digit in chnRMBText)
    {
        Assert.IsTrue(NumeralUtil.IsChineseNumeral(digit));
    }

    Assert.IsFalse(NumeralUtil.IsChineseNumeral('7'));
}
/// <summary>
/// Verifies that <c>NumeralUtil.ConvertChineseNumeral2Arabic</c> rewrites Chinese numerals embedded
/// in running text as Arabic digits while leaving the surrounding text untouched.
/// </summary>
public void TestConvertChineseNumeral2Arabic()
{
    // Assert.AreEqual takes (expected, actual); the arguments were previously passed the other
    // way round, which makes a failure report label the two values incorrectly.

    // Amounts with place-value characters, plus a negative number.
    Assert.AreEqual(
        "你好,这里有235块钱,收好了,总共2252块。2减5等于-30050。",
        NumeralUtil.ConvertChineseNumeral2Arabic("你好,这里有二百三十五块钱,收好了,总共二千二百五十二块。二减五等于负三万零五十。"));

    // Digit-by-digit year.
    Assert.AreEqual(
        "公元2005年4月",
        NumeralUtil.ConvertChineseNumeral2Arabic("公元二零零五年四月"));

    Assert.AreEqual(
        "公元前455年",
        NumeralUtil.ConvertChineseNumeral2Arabic("公元前四五五年"));
}
/// <summary>
/// Scans a window of <c>context.Text</c> beginning at <paramref name="startIndex"/> for the
/// components of a Chinese postal address (country, province, city, district, county, street,
/// lane/number/floor/room, building, extras) and collects them in a <c>ChineseAddress</c>.
/// </summary>
/// <remarks>
/// Characters are accumulated in a pending buffer until a component suffix character
/// (for example '市', '区', '路', '号') is met; the buffer is then stored in the matching
/// <c>ChineseAddress</c> field and appended to the matched text. Scanning stops at Chinese
/// punctuation or a space. The window is at most <c>maxChineseAddressLength</c> characters long.
/// </remarks>
/// <param name="startIndex">Index into <c>context.Text</c> where scanning starts.</param>
/// <returns>
/// A collection with one <c>POSType.D_S</c> result carrying the matched text and the populated
/// <c>ChineseAddress</c>, or an empty collection when no component was recognised.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();
    string temp = text.Substring(startIndex, Math.Min(maxChineseAddressLength, text.Length - startIndex));

    StringBuilder sb = new StringBuilder();     // characters read since the last recognised component
    StringBuilder whole = new StringBuilder();  // text of every recognised component, in order
    ChineseAddress ca = new ChineseAddress();
    int startpos = 0;

    // TODO: look up country names in a dictionary.
    // Ordinal comparison: these are literal markers, not linguistic text.
    if (temp.StartsWith("中国", StringComparison.Ordinal))
    {
        startpos = 2;
        ca.country = "中国";
        whole.Append("中国");
    }

    for (int i = startpos; i < temp.Length; i++)
    {
        char ch = temp[i];

        if (ch == '市' || ch == '场')
        {
            // A suffix character with nothing pending is treated as ordinary text.
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            string city = GetMaximumMatch(sb.ToString(), 0, 5);
            if (city != null)
            {
                ca.city = city;
                whole.Append(ca.city);
                sb = new StringBuilder();
            }
        }
        else if (ch == '区')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            string subStr = sb.ToString();
            string district = GetMaximumMatch(subStr, 0, 5);
            if (district == null)
            {
                // No known name matched: keep the pending text as the district.
                ca.district = subStr;
                whole.Append(ca.district);
            }
            else if (!district.EndsWith("区", StringComparison.Ordinal))
            {
                // The match does not end in '区', so it is stored as the city and the
                // remainder of the pending text becomes the district.
                ca.city = district;
                whole.Append(ca.city);
                ca.district = subStr.Substring(ca.city.Length);
                whole.Append(ca.district);
            }
            else
            {
                ca.district = district;
                whole.Append(ca.district);
            }

            sb = new StringBuilder();
        }
        else if (ch == '省')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            string province = GetMaximumMatch(sb.ToString(), 0, 5); // province
            if (province != null)
            {
                ca.province = province;
                whole.Append(ca.province);
                sb = new StringBuilder();
            }
        }
        else if (ch == '乡' || ch == '村' || ch == '县' || ch == '镇')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            ca.county = sb.ToString();
            whole.Append(ca.county);
            sb = new StringBuilder();
        }
        else if (ch == '巷')
        {
            // NOTE(review): '巷' is consumed without being buffered or recorded, exactly as
            // before; confirm whether lane names were meant to be handled here.
        }
        else if (ch == '楼' || ch == '弄' || ch == '号' || ch == '室')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            // The pending text must be a number (Chinese numerals are converted first).
            string substr = NumeralUtil.ConvertChineseNumeral2Arabic(sb.ToString());
            int x;
            sb.Append(ch);
            if (Int32.TryParse(substr, out x))
            {
                string numbered = sb.ToString();
                if (ch == '楼')
                {
                    ca.floor = numbered;
                }
                else if (ch == '弄')
                {
                    ca.lane = numbered;
                }
                else if (ch == '号')
                {
                    ca.no = numbered;
                }
                else
                {
                    ca.room = numbered;
                }

                whole.Append(numbered);
                sb = new StringBuilder();
            }
        }
        else if (ch == '道' || ch == '路' || ch == '街')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            ca.street = sb.ToString();
            whole.Append(ca.street);
            sb = new StringBuilder();
        }
        else if (ch == '(' || ch == '\uFF08')
        {
            // Opening parenthesis (ASCII or full-width U+FF08): pending text is discarded
            // and a parenthesised extra starts.
            sb = new StringBuilder();
            sb.Append(ch);
        }
        else if (ch == ')' || ch == '\uFF09')
        {
            // Closing parenthesis (ASCII or full-width U+FF09) ends the extra.
            sb.Append(ch);
            string parenthesised = sb.ToString();
            whole.Append(parenthesised);
            ca.extra = parenthesised;
            sb = new StringBuilder();
        }
        else if (CharacterUtil.IsChinesePunctuation(ch) || ch == ' ' || ch == '\u3000')
        {
            // Punctuation or a space (ASCII or ideographic U+3000) ends the address.
            break;
        }
        else if (ch == '大')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            bool consumed = false;
            if (i + 1 < temp.Length)
            {
                char nextchar = temp[i + 1];
                if (nextchar == '桥' || nextchar == '厦')
                {
                    // "...大桥" is recorded as an extra, "...大厦" as the building.
                    string name = sb.ToString() + "大" + nextchar;
                    whole.Append(name);
                    if (nextchar == '桥')
                    {
                        ca.extra += name;
                    }
                    else
                    {
                        ca.building = name;
                    }

                    i += 1; // also consume nextchar
                    sb = new StringBuilder();
                    consumed = true;
                }
                else if (nextchar == '酒' && i + 2 < temp.Length && temp[i + 2] == '店')
                {
                    string name = sb.ToString() + "大酒店";
                    string city = GetMaximumMatch(name, 0, 5); // city or province
                    if (city != null)
                    {
                        ca.city = city;
                        whole.Append(ca.city);
                        name = name.Substring(ca.city.Length);
                    }

                    whole.Append(name);
                    ca.building = name;
                    i += 2; // also consume "酒店"
                    sb = new StringBuilder();
                    consumed = true;
                }
            }

            // When no pattern matched, '大' is ordinary text. It used to be dropped here,
            // corrupting the pending buffer (e.g. "北京大学" became "北京学").
            if (!consumed)
            {
                sb.Append(ch);
            }
        }
        else if (ch == '餐')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            if (i + 1 < temp.Length && temp[i + 1] == '厅')
            {
                string name = sb.ToString() + "餐厅";
                whole.Append(name);
                ca.extra += name;
                i += 1; // also consume '厅'
                sb = new StringBuilder();
            }
            else
            {
                // Same fix as for '大': keep the character instead of dropping it.
                sb.Append(ch);
            }
        }
        else
        {
            sb.Append(ch);
            string extra = sb.ToString();
            if (extra.EndsWith("中心", StringComparison.Ordinal) || extra.EndsWith("酒店", StringComparison.Ordinal))
            {
                string city = GetMaximumMatch(extra, 0, 5); // city
                if (city != null)
                {
                    ca.city = city;
                    // Keep the city in the matched text, as the "大酒店" branch does;
                    // previously it was stripped from the building name and lost.
                    whole.Append(city);
                    extra = extra.Substring(city.Length);
                }

                ca.building = extra;
                whole.Append(extra);

                // Handle "...中心大厦": the trailing "大厦" belongs to the same building.
                if (i + 2 < temp.Length && temp[i + 1] == '大' && temp[i + 2] == '厦')
                {
                    ca.building += "大厦";
                    whole.Append("大厦");
                    i += 2;
                }

                sb = new StringBuilder();
            }
        }
    }

    if (whole.Length > 0)
    {
        // Whatever is still pending is kept as the extra (replacing any earlier value).
        if (sb.Length > 0)
        {
            ca.extra = sb.ToString();
        }

        prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.D_S, ca));
    }

    return prc;
}
/// <summary>
/// Tells whether <paramref name="ch"/> may appear inside a phone number written in the given
/// regional <paramref name="pattern"/>.
/// </summary>
/// <param name="ch">Character to test.</param>
/// <param name="pattern">Regional phone number convention.</param>
/// <returns>
/// <c>true</c> for digits and the separator characters allowed by the pattern; <c>false</c>
/// otherwise, including for any pattern other than China and North America.
/// </returns>
bool IsAllowedChar(char ch, ParserPattern pattern)
{
    // NOTE(review): several comparisons in the previous version were textually duplicated
    // (e.g. the same '(' twice), which suggests full-width literals were lost in a
    // re-encoding of the file. The full-width variants are written as \u escapes so that
    // they cannot be lost that way; the ASCII forms are kept as well.
    if (pattern == ParserPattern.China)
    {
        return (ch >= '0' && ch <= '9')
            || (ch >= '\uFF10' && ch <= '\uFF19')                        // full-width digits
            || NumeralUtil.IsArabicNumeral(ch)
            || ch == '-' || ch == '\uFF0D' || ch == '\u2014'            // hyphen, full-width hyphen, em dash
            || ch == '(' || ch == '\uFF08' || ch == ')' || ch == '\uFF09' // ASCII / full-width parentheses
            || ch == '+'
            || ch == '#'
            || ch == ' ' || ch == '\u3000';                              // space, ideographic space
    }

    if (pattern == ParserPattern.NorthAmerica)
    {
        return NumeralUtil.IsArabicNumeral(ch)
            || ch == '-' || ch == '\uFF0D'
            || ch == '(' || ch == ')'
            || ch == '+'
            || ch == ' '
            || ch == 'e' || ch == 'x' || ch == 't' || ch == '.';         // letters of "ext."
    }

    return false;
}
/// <summary>
/// Parses a phone number that starts at <paramref name="startIndex"/> in <c>context.Text</c>,
/// following the convention selected by <c>context.Pattern</c>.
/// </summary>
/// <remarks>
/// The method first collects the longest run of characters accepted by <c>IsAllowedChar</c>,
/// normalising full-width digits, spaces, parentheses and dashes to their ASCII forms. An
/// unclosed opening parenthesis and everything after it is cut off. The normalised run is then
/// split into country code, area code, main number and extension, which are stored in a
/// <c>PhoneNo</c>.
/// </remarks>
/// <param name="startIndex">Index into <c>context.Text</c> where scanning starts.</param>
/// <returns>
/// A collection with one <c>POSType.A_M</c> result carrying the matched text and the
/// <c>PhoneNo</c>, or an empty collection when no main number was found.
/// </returns>
/// <exception cref="NotImplementedException">
/// The pattern is neither China nor North America (raised only when the candidate run passed
/// the length checks).
/// </exception>
public ParseResultCollection Parse(int startIndex)
{
    string text = context.Text;
    ParserPattern pattern = context.Pattern;
    ParseResultCollection prc = new ParseResultCollection();

    // Nothing to read at or past the end of the text.
    if (startIndex >= text.Length)
    {
        return prc;
    }

    // A phone number never starts with a space (ASCII or ideographic U+3000).
    if (text[startIndex] == ' ' || text[startIndex] == '\u3000')
    {
        return prc;
    }

    char ch;
    StringBuilder sb = new StringBuilder(10);
    int braceStartPos = -1; // offset in sb of a '(' that has not been closed yet, or -1

    for (int k = startIndex; k < text.Length; k++)
    {
        ch = text[k];
        if (!IsAllowedChar(ch, pattern))
        {
            break;
        }

        // Normalise full-width forms to ASCII. Escapes are used because the literal
        // full-width characters appear to have been folded to ASCII in this file, which
        // had turned every one of these statements into a no-op.
        if (ch >= '\uFF10' && ch <= '\uFF19')
        {
            ch = (char)(ch - '\uFF10' + '0');
        }

        if (ch == '\u3000')
        {
            ch = ' ';
        }
        else if (ch == '\uFF08')
        {
            ch = '(';
        }
        else if (ch == '\uFF09')
        {
            ch = ')';
        }
        else if (ch == '\uFF0D' || ch == '\u2014')
        {
            ch = '-';
        }

        if (ch == '(')
        {
            // Offset within sb, not the absolute text index: it is used below to cut the
            // collected string, which starts at startIndex rather than at 0.
            braceStartPos = sb.Length;
        }
        else if (ch == ')')
        {
            braceStartPos = -1;
        }

        sb.Append(ch);
    }

    string allowedString = sb.ToString().TrimEnd();
    if (braceStartPos >= 0)
    {
        allowedString = allowedString.Substring(0, braceStartPos);
    }

    if (allowedString.Length < 3 || allowedString.Length == 4)
    {
        return prc;
    }

    bool bNumberInBrace = false;
    bool bCountryCodeStarted = false;
    bool bAreaCodeStarted = false;
    bool bExtStarted = false;
    int i = 0;
    StringBuilder segment = new StringBuilder(); // digits of the part currently being read
    StringBuilder whole = new StringBuilder();   // matched text reported to the caller
    PhoneNo phone = new PhoneNo();

    if (pattern == ParserPattern.China)
    {
        while (i < allowedString.Length)
        {
            ch = allowedString[i];
            if (ch == '(')
            {
                bNumberInBrace = true;
                bCountryCodeStarted = false;
                whole.Append(ch);
            }
            else if (NumeralUtil.IsArabicNumeral(ch))
            {
                // The first digit group is taken as an area code candidate unless a
                // country code is being read or an area code is already known.
                if (segment.Length == 0 && !bAreaCodeStarted && phone.AreaCode == null && !bCountryCodeStarted)
                {
                    bAreaCodeStarted = true;
                }

                segment.Append(ch);
                whole.Append(ch);
            }
            else if (ch == ')' && bNumberInBrace)
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }

                    bCountryCodeStarted = false;
                }

                if (bAreaCodeStarted)
                {
                    // Area codes starting with 0 have at most 4 digits; others at most 3.
                    if (segment.Length > 0 && (segment[0] == '0' ? segment.Length <= 4 : segment.Length <= 3))
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }

                whole.Append(ch);
                segment = new StringBuilder();
                bNumberInBrace = false;
            }
            else if (ch == ' ')
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }

                    bCountryCodeStarted = false;
                }
                else if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                }

                segment = new StringBuilder();
                bCountryCodeStarted = false;
                whole.Append(ch);
            }
            else if (ch == '-' || ch == '#')
            {
                // segment may be empty here (e.g. "(010)-..." or two separators in a row);
                // the length is checked before segment[0] is read, which used to throw.
                if (segment.Length > 0)
                {
                    if (segment[0] == '0' && (segment.Length == 3 || segment.Length == 4))
                    {
                        phone.AreaCode = segment.ToString();
                    }
                    else
                    {
                        AssignPhoneMain(segment, phone);
                        bExtStarted = true;
                    }
                }

                segment = new StringBuilder();
                whole.Append(ch);
            }
            else if (ch == '+')
            {
                whole.Append(ch);
                bCountryCodeStarted = true;
            }

            i++;
        }

        if (segment.Length > 0)
        {
            AssignPhoneMain(segment, phone);
            if (bExtStarted)
            {
                phone.Extension = segment.ToString();
                bExtStarted = false;
            }
        }
    }
    else if (pattern == ParserPattern.NorthAmerica)
    {
        while (i < allowedString.Length)
        {
            ch = allowedString[i];
            if (NumeralUtil.IsArabicNumeral(ch))
            {
                whole.Append(ch);
                segment.Append(ch);
            }
            else if (ch == ' ')
            {
                whole.Append(ch);
            }
            else if (ch == '(')
            {
                bAreaCodeStarted = true;
                whole.Append(ch);
            }
            else if (ch == ')')
            {
                if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }

                segment = new StringBuilder();
                whole.Append(ch);
            }
            else if (ch == '-')
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }

                    // The group after the country code is the area code.
                    bCountryCodeStarted = false;
                    bAreaCodeStarted = true;
                }
                else if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                }

                whole.Append(ch);
                segment = new StringBuilder();
            }
            else if (ch == '+')
            {
                bCountryCodeStarted = true;
                whole.Append(ch);
            }
            else if (ch == '.')
            {
                // A dot is only valid as the end of "ext.".
                // NOTE(review): segment is not reset and bExtStarted is never set in this
                // branch, so the extension does not appear to be captured for this pattern;
                // behaviour is left unchanged pending confirmation.
                if (segment.ToString() != "ext")
                {
                    break;
                }

                whole.Append("ext.");
            }
            else if (ch == 'e' || ch == 'x' || ch == 't')
            {
                segment.Append(ch);
            }

            i++;
        }

        if (segment.Length > 0)
        {
            AssignPhoneMain(segment, phone);
            if (bExtStarted)
            {
                phone.Extension = segment.ToString();
                bExtStarted = false;
            }
        }
    }
    else
    {
        throw new NotImplementedException("Phone No. in " + pattern.ToString() + " is not implemented in the parser.");
    }

    if (whole.Length > 0 && phone.Main != null)
    {
        prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.A_M, phone));
    }

    return prc;
}
/// <summary>
/// Recognises a Chinese date/time expression that starts at <paramref name="startIndex"/> in
/// <c>context.Text</c>.
/// </summary>
/// <remarks>
/// Two kinds of expression are handled. Relative expressions ("上周", "前天", "昨天", "明天",
/// "今天", "后天", "去年", "前年", "今年", "后年") are reported as text only. Numeric expressions
/// are runs of digits each followed by a unit character (年, 月, 日, 点, 分, 秒); a pattern string
/// is built alongside the text with <c>DateUtil.GeneratePatternText</c> and both are handed to
/// <c>DateUtil.ParseDate</c>. Chinese numerals are converted to Arabic digits before scanning,
/// and at most <c>maxDateTimeTextLength</c> converted characters are examined.
/// </remarks>
/// <param name="startIndex">Index into <c>context.Text</c> where scanning starts.</param>
/// <returns>
/// A collection with one <c>POSType.D_T</c> result (carrying the parsed date for numeric
/// expressions), or an empty collection when nothing was recognised.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    ParseResultCollection prc = new ParseResultCollection();
    string rawText = context.Text;
    if (startIndex >= rawText.Length)
    {
        return prc;
    }

    // Convert only the text from startIndex onwards. Converting the whole text first and
    // then applying startIndex to the result was wrong whenever an earlier numeral changed
    // length during conversion (e.g. "二百三十五" -> "235"): the scan started at the wrong
    // character and the window length could even become negative.
    string text = NumeralUtil.ConvertChineseNumeral2Arabic(rawText.Substring(startIndex));
    int boundary = Math.Min(maxDateTimeTextLength, text.Length);

    StringBuilder sbDateText = new StringBuilder();     // digits and unit characters given to ParseDate
    StringBuilder sbPatternText = new StringBuilder();  // matching pattern, e.g. produced for 'y' + "年"
    StringBuilder sbText = new StringBuilder();         // matched text reported to the caller
    int strLen = 0;          // digits read since the last unit character
    char prevCh = ' ';       // previous non-space character
    bool nonNumeric = false; // set when a relative expression was matched

    for (int i = 0; i < boundary; i++)
    {
        char ch = text[i];

        if (NumeralUtil.IsArabicNumeral(ch))
        {
            sbDateText.Append(ch);
            sbText.Append(ch);
            strLen++;
        }
        else if (ch == '大' || ch == '上' || ch == '前' || ch == '昨' || ch == '明' || ch == '今' || ch == '后' || ch == '去')
        {
            // Possible first character of a relative expression; it is only remembered in
            // prevCh. '上' is accepted here so that the "上周" check below can be reached
            // (it used to fall into the final break).
        }
        else if (ch == '周' || ch == '天')
        {
            bool relative = ch == '周'
                ? prevCh == '上'
                : (prevCh == '前' || prevCh == '昨' || prevCh == '明' || prevCh == '今' || prevCh == '后');
            if (relative)
            {
                nonNumeric = true;
                sbText.Append(prevCh);
                sbText.Append(ch);
                break;
            }
        }
        else
        {
            // "去年", "前年", "今年", "后年".
            if (ch == '年' && (prevCh == '去' || prevCh == '前' || prevCh == '今' || prevCh == '后'))
            {
                nonNumeric = true;
                sbText.Append(prevCh);
                sbText.Append(ch);
                break;
            }

            // Map the unit character to the pattern letter passed to GeneratePatternText.
            char patternChar;
            switch (ch)
            {
                case '年': patternChar = 'y'; break;
                case '月': patternChar = 'M'; break;
                case '日': patternChar = 'd'; break;
                case '点': patternChar = 'h'; break;
                case '分': patternChar = 'm'; break;
                case '秒': patternChar = 's'; break;
                default: patternChar = '\0'; break;
            }

            if (patternChar == '\0')
            {
                if (ch == ' ')
                {
                    // Spaces are kept in the matched text but do not update prevCh.
                    sbText.Append(ch);
                    continue;
                }

                // Any other character ends the expression.
                break;
            }

            // A unit character must be preceded by at least one digit.
            if (strLen == 0)
            {
                return prc;
            }

            sbDateText.Append(ch);
            sbPatternText.Append(DateUtil.GeneratePatternText(patternChar, strLen));
            sbPatternText.Append(ch);
            sbText.Append(ch);
            strLen = 0;
        }

        prevCh = ch;
    }

    if (sbText.Length > 0 && nonNumeric)
    {
        prc.Add(ParseResult.Create(sbText.ToString(), startIndex, POSType.D_T));
        return prc;
    }

    if (sbDateText.Length == 0 || sbPatternText.Length == 0)
    {
        return prc;
    }

    DateTime? dt = DateUtil.ParseDate(sbDateText.ToString(), sbPatternText.ToString());
    if (dt != null)
    {
        prc.Add(ParseResult.Create(sbText.ToString(), startIndex, POSType.D_T, dt));
    }

    return prc;
}