/// <summary>
/// Reads a run of digits starting at <paramref name="startIndex"/> and reports it as a postal code
/// when its length fits the active <c>ParserPattern</c>.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> where the candidate code begins.</param>
/// <returns>
/// A collection with one <c>POSType.A_M</c> result holding the digit run, or an empty collection
/// when the pattern is China and the run is not 6 digits, or the pattern is NorthAmerica and the
/// run is not 5 digits. For any other pattern the run is reported regardless of its length.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParserPattern _format = context.Pattern;
    ParseResultCollection prc = new ParseResultCollection();
    StringBuilder sb = new StringBuilder(6);

    // Check the bound BEFORE reading each character. The previous loop read _text[++i] first and
    // only then tested i < _text.Length (and, because && binds tighter than ||, not even for every
    // character), so a digit run reaching the end of the text threw IndexOutOfRangeException.
    int i = startIndex;
    while (i < _text.Length)
    {
        char ch = _text[i];
        bool isDigit = NumeralUtil.IsArabicNumeral(ch) || (ch >= '0' && ch <= '9');
        if (!isDigit)
        {
            break;
        }
        sb.Append(ch);
        i++;
    }

    string source = sb.ToString();
    if (_format == ParserPattern.China)
    {
        if (source.Length != 6)
        {
            return prc;
        }
    }
    else if (_format == ParserPattern.NorthAmerica)
    {
        if (source.Length != 5)
        {
            return prc;
        }
    }

    prc.Add(ParseResult.Create(source, startIndex, POSType.A_M));
    return prc;
}
/// <summary>
/// Looks for a numeral run at <paramref name="startIndex"/> that is immediately followed by a
/// character accepted by <c>IsChineseQuantity</c>.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> where the numeral is expected to begin.</param>
/// <returns>
/// Two results (the numeral as <c>POSType.A_M</c>, then the following character as
/// <c>POSType.A_Q</c>), or an empty collection when no such pair is found.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string text = context.Text;
    ParseResultCollection results = new ParseResultCollection();

    int cursor = startIndex;
    char current = text[cursor];
    while (true)
    {
        bool numeric = NumeralUtil.IsArabicNumeral(current)
                       || NumeralUtil.IsChineseNumeralChars(current)
                       || current == '.';
        if (!numeric || cursor + 1 >= text.Length)
        {
            break;
        }

        // The numeral may not start with a character flagged by IsChineseGenDigit, except '十'.
        if (cursor == startIndex && NumeralUtil.IsChineseGenDigit(current) && current != '十')
        {
            return results;
        }

        cursor++;
        current = text[cursor];
    }

    // Nothing was consumed, so there is no numeral here.
    if (cursor == startIndex)
    {
        return results;
    }

    int unitIndex = Math.Min(cursor, text.Length);
    if (!IsChineseQuantity(text[unitIndex]))
    {
        return results;
    }

    string number = text.Substring(startIndex, cursor - startIndex);
    results.Add(ParseResult.Create(number, startIndex, POSType.A_M));
    results.Add(ParseResult.Create(text[cursor].ToString(), cursor, POSType.A_Q));
    return results;
}
// Test helper: runs a PhoneNoParser over the whole of `text` through
// ParseResultCollection.InternalParse, using a default-constructed context.
ParseResultCollection ParsePhoneNo(string text)
{
    var parserContext = new ParserContext { Text = text };
    var phoneParser = new PhoneNoParser(parserContext);
    return ParseResultCollection.InternalParse(text, phoneParser);
}
/// <summary>
/// OrgNameParser must return exactly one A_NT result for each sample when parsing starts at the
/// given offset.
/// </summary>
public void TestParseSingleOrgName()
{
    // Row k: input text, offset passed to Parse, expected organization name.
    string[] inputs =
    {
        "上海软星动力软件有限公司",
        "这里是上海互联网信息有限公司",
        "鹿特丹美术学院",
        "鹿特丹美院",
        "先锋商泰(上海)有限公司"
    };
    int[] offsets = { 0, 3, 0, 0, 0 };
    string[] expectedNames =
    {
        "上海软星动力软件有限公司",
        "上海互联网信息有限公司",
        "鹿特丹美术学院",
        "鹿特丹美院",
        "先锋商泰(上海)有限公司"
    };

    for (int k = 0; k < inputs.Length; k++)
    {
        IParser orgParser = new OrgNameParser(GeneralParserTest.CreateParserContext(inputs[k], dictAddr));
        ParseResultCollection found = orgParser.Parse(offsets[k]);
        Assert.AreEqual(1, found.Count);
        GeneralParserTest.AssertParseResult(found[0], expectedNames[k], offsets[k], POSType.A_NT);
    }
}
/// <summary>
/// Checks text, length, start position, type and DateTime value of the results that
/// ParseChineseDateTime produces for numeric Chinese date expressions.
/// </summary>
public void TestConvertChineseText2Result()
{
    // Year only.
    ParseResultCollection dates = ParseChineseDateTime("2005年至今");
    Assert.AreEqual(1, dates.Count);
    ParseResult single = dates[0];
    Assert.AreEqual("2005年", single.Text);
    Assert.AreEqual(5, single.Length);
    Assert.AreEqual(0, single.StartPos);
    Assert.AreEqual(POSType.D_T, single.Type);
    Assert.AreEqual(new DateTime(2005, 1, 1), (DateTime)single.Value);
    dates.Clear();

    // Year and month.
    dates = ParseChineseDateTime("2005年7月至今");
    Assert.AreEqual(1, dates.Count);
    single = dates[0];
    Assert.AreEqual("2005年7月", single.Text);
    Assert.AreEqual(7, single.Length);
    Assert.AreEqual(0, single.StartPos);
    Assert.AreEqual(POSType.D_T, single.Type);
    Assert.AreEqual(new DateTime(2005, 7, 1), (DateTime)single.Value);
    dates.Clear();

    // Two dates in one sentence.
    dates = ParseChineseDateTime("2005年7月2日到2010年5月");
    Assert.AreEqual(2, dates.Count);
    ParseResult first = dates[0];
    Assert.AreEqual("2005年7月2日", first.Text);
    Assert.AreEqual(9, first.Length);
    Assert.AreEqual(0, first.StartPos);
    Assert.AreEqual(POSType.D_T, first.Type);
    Assert.AreEqual((DateTime)first.Value, new DateTime(2005, 7, 2));
    ParseResult second = dates[1];
    Assert.AreEqual("2010年5月", second.Text);
    Assert.AreEqual(7, second.Length);
    Assert.AreEqual(10, second.StartPos);
    Assert.AreEqual(POSType.D_T, second.Type);
    Assert.AreEqual((DateTime)second.Value, new DateTime(2010, 5, 1));
    dates.Clear();

    // Two dates with embedded spaces; the spaces are part of the reported text.
    dates = ParseChineseDateTime("2005年7月 - 2010 年 05月");
    Assert.AreEqual(2, dates.Count);
    first = dates[0];
    Assert.AreEqual("2005年7月", first.Text);
    Assert.AreEqual(7, first.Length);
    Assert.AreEqual(0, first.StartPos);
    Assert.AreEqual(POSType.D_T, first.Type);
    Assert.AreEqual((DateTime)first.Value, new DateTime(2005, 7, 1));
    second = dates[1];
    Assert.AreEqual("2010 年 05月", second.Text);
    Assert.AreEqual(10, second.Length);
    Assert.AreEqual(9, second.StartPos);
    Assert.AreEqual(POSType.D_T, second.Type);
    Assert.AreEqual((DateTime)second.Value, new DateTime(2010, 5, 1));
}
// Test helper: runs a SimChineseQuantityParser over the whole of `text` through
// ParseResultCollection.InternalParse, using a default-constructed context.
ParseResultCollection ParseChineseQuantity(string text)
{
    var parserContext = new ParserContext { Text = text };
    var quantityParser = new SimChineseQuantityParser(parserContext);
    return ParseResultCollection.InternalParse(text, quantityParser);
}
/// <summary>
/// The text "兆棵树" must not be reported as a quantity: parsing at index 0 yields no results.
/// </summary>
public void TestInvalidQuantityCase()
{
    IParser quantityParser = new SimChineseQuantityParser(TestUtility.CreateParserContext("兆棵树"));

    ParseResultCollection found = quantityParser.Parse(0);

    Assert.AreEqual(0, found.Count);
}
/// <summary>
/// Collects the digit run that begins at <paramref name="startIndex"/> and returns it as a postal
/// code if its length is valid for the configured <c>ParserPattern</c>.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> where the candidate code begins.</param>
/// <returns>
/// One <c>POSType.A_M</c> result containing the digits, or an empty collection when the length
/// check fails (6 digits required for China, 5 for NorthAmerica). Other patterns apply no length
/// check.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParserPattern _format = context.Pattern;
    ParseResultCollection prc = new ParseResultCollection();
    StringBuilder sb = new StringBuilder(6);

    // The bound is tested before every read. The original tested it after `_text[++i]` and, due
    // to && binding tighter than ||, only on one side of the condition, so text ending in digits
    // (for example just "200135") raised IndexOutOfRangeException.
    for (int i = startIndex; i < _text.Length; i++)
    {
        char ch = _text[i];
        if (!NumeralUtil.IsArabicNumeral(ch) && !(ch >= '0' && ch <= '9'))
        {
            break;
        }
        sb.Append(ch);
    }

    string source = sb.ToString();
    if (_format == ParserPattern.China && source.Length != 6)
    {
        return prc;
    }
    if (_format == ParserPattern.NorthAmerica && source.Length != 5)
    {
        return prc;
    }

    prc.Add(ParseResult.Create(source, startIndex, POSType.A_M));
    return prc;
}
/// <summary>
/// Exercises ChineseAddressParser on addresses that end in a bridge, a parenthesized remark, a
/// hotel/restaurant name or a floor, and checks both the reported text and the address fields.
/// </summary>
public void TestParseSingleAddressWithoutTerminator()
{
    // Address ending in a bridge name.
    // TODO: the trailing 立交桥 is not part of the expected result yet.
    IParser addressParser = new ChineseAddressParser(CreateParserContext("上海市黄浦区内环南浦大桥立交桥"));
    ParseResultCollection found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "上海市黄浦区内环南浦大桥", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, null, "上海市", "黄浦区", null, null, null, null, null, "内环南浦大桥", null);

    // Label prefix skipped via the start offset; parenthesized remark at the end.
    addressParser = new ChineseAddressParser(CreateParserContext("地址:杭州市江干区九堡九环路60号(厂房)"));
    found = addressParser.Parse(3);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "杭州市江干区九堡九环路60号(厂房)", 3, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, null, "杭州市", "江干区", "九堡九环路", "60号", null, null, null, "(厂房)", null);

    // Same address followed by a space and unrelated text.
    addressParser = new ChineseAddressParser(CreateParserContext("杭州市江干区九堡九环路60号(厂房) 邮编:"));
    found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "杭州市江干区九堡九环路60号(厂房)", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, null, "杭州市", "江干区", "九堡九环路", "60号", null, null, null, "(厂房)", null);

    // City + hotel + floor + restaurant.
    addressParser = new ChineseAddressParser(CreateParserContext("地址:杭州红楼大酒店二楼融府中餐厅"));
    found = addressParser.Parse(3);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "杭州红楼大酒店二楼融府中餐厅", 3, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, null, "杭州", null, null, null, "二楼", null, "红楼大酒店", "融府中餐厅", null);

    // District + hotel + floor (the result count is not asserted for this case).
    addressParser = new ChineseAddressParser(CreateParserContext("浦东新区红楼大酒店三楼"));
    found = addressParser.Parse(0);
    GeneralParserTest.AssertParseResult(found[0], "浦东新区红楼大酒店三楼", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, null, null, "浦东新区", null, null, "三楼", null, "红楼大酒店", null, null);
}
// Test helper: runs an OrgNameParser over the whole of `text` through
// ParseResultCollection.InternalParse, with a context built from dictAddr.
ParseResultCollection ParseChineseOrgName(string text)
{
    var parserContext = new ParserContext(dictAddr) { Text = text };
    var orgParser = new OrgNameParser(parserContext);
    return ParseResultCollection.InternalParse(text, orgParser);
}
/// <summary>
/// Text that contains no address must produce an empty result collection.
/// </summary>
public void TestParseNoneAddressText()
{
    IParser addressParser = new ChineseAddressParser(CreateParserContext("这是一个测试"));

    ParseResultCollection found = addressParser.Parse(0);

    Assert.AreEqual(0, found.Count);
}
/// <summary>
/// Parses an address whose tail ("一号厂房") is only partly captured by the parser.
/// </summary>
public void TestParseSingleSpecialAddress()
{
    IParser addressParser = new ChineseAddressParser(CreateParserContext("地址:杭州市江干区九堡九环路60号一号厂房"));

    ParseResultCollection found = addressParser.Parse(3);

    Assert.AreEqual(1, found.Count);
    // TODO: the trailing 厂房 cannot be recognized yet, so it is absent from the expected text.
    GeneralParserTest.AssertParseResult(found[0], "杭州市江干区九堡九环路60号一号", 3, POSType.D_S);
}
/// <summary>
/// A mobile number with a "+86" prefix is parsed as one A_M result under the China pattern.
/// </summary>
public void TestParseSingleMobile()
{
    PhoneNoParser mobileParser = new PhoneNoParser(TestUtility.CreateParserContext("+86 13482572088", ParserPattern.China));

    ParseResultCollection found = mobileParser.Parse(0);

    Assert.AreEqual(1, found.Count);
    TestUtility.AssertParseResult(found[0], "+86 13482572088", 0, POSType.A_M);
    AssertPhoneValue(found[0], "86", null, "13482572088", null, true);
}
/// <summary>
/// Scans <paramref name="text"/> from left to right and, at each position, tries every registered
/// parser in order until one returns results.
/// </summary>
/// <param name="text">Text to scan.</param>
/// <param name="pattern">Pattern stored in the <c>ParserContext</c> handed to each parser.</param>
/// <returns>All results produced by the parsers, in scan order.</returns>
/// <remarks>
/// Parser order in <c>parserConstructors</c> is the priority order: the first parser that yields
/// results at a position wins. Exceptions thrown by a parser are written to the console and that
/// parser is treated as having found nothing.
/// </remarks>
public ParseResultCollection Recognize(string text, ParserPattern pattern)
{
    ParserContext context = new ParserContext();
    context.Pattern = pattern;
    context.Text = text;
    ParseResultCollection result = new ParseResultCollection();
    char[] chars = text.ToCharArray();

    int i = 0;
    while (i < chars.Length)
    {
        if (CharacterUtil.IsChinesePunctuation(chars[i]))
        {
            i++;
            continue;
        }

        // Every parser is tried at the SAME position. Previously the "not found" increment lived
        // inside this loop, so each failing parser shifted the position for the next one: later
        // parsers never saw the current character and characters were skipped.
        int start = i;
        foreach (ConstructorInfo ci in parserConstructors)
        {
            IParser parser = ci.Invoke(new object[] { context }) as IParser;
            bool isFound = false;
            try
            {
                ParseResultCollection prc = parser.Parse(start);
                if (prc.Count > 0)
                {
                    foreach (ParseResult pr in prc)
                    {
                        result.Add(pr);
                        i += pr.Length;
                    }
                    isFound = true;
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }

            if (isFound)
            {
                break;
            }
        }

        // Always make progress: covers "no parser matched", an empty parser list, and results
        // whose total length is zero (each of which could previously spin forever).
        if (i <= start)
        {
            i = start + 1;
        }
    }

    return result;
}
/// <summary>
/// Checks which numeral/measure-word pairs the quantity parser reports in "李三买了一张三角桌子".
/// </summary>
public void TestNotZhangSanCase()
{
    ParseResultCollection found = ParseChineseQuantity("李三买了一张三角桌子");

    Assert.AreEqual(4, found.Count);
    TestUtility.AssertParseResult(found[0], "一", 4, POSType.A_M);
    TestUtility.AssertParseResult(found[1], "张", 5, POSType.A_Q);
    // TODO: decide whether the 三角 pair below should be dropped.
    TestUtility.AssertParseResult(found[2], "三", 6, POSType.A_M);
    TestUtility.AssertParseResult(found[3], "角", 7, POSType.A_Q);
}
/// <summary>
/// Walks through <paramref name="text"/> and, for each non-punctuation position, runs the
/// registered parsers in priority order, keeping the results of the first one that matches.
/// </summary>
/// <param name="text">Text to scan.</param>
/// <param name="pattern">Pattern assigned to the <c>ParserContext</c> given to every parser.</param>
/// <returns>The accumulated parse results in the order they were found.</returns>
/// <remarks>
/// A parser that throws is logged to the console and skipped. When nothing matches at a position
/// the scan moves on by exactly one character.
/// </remarks>
public ParseResultCollection Recognize(string text, ParserPattern pattern)
{
    ParserContext context = new ParserContext();
    context.Pattern = pattern;
    context.Text = text;
    ParseResultCollection result = new ParseResultCollection();
    char[] chars = text.ToCharArray();

    int i = 0;
    while (i < chars.Length)
    {
        char c = chars[i];
        if (CharacterUtil.IsChinesePunctuation(c))
        {
            i++;
            continue;
        }

        int position = i;
        bool isFound = false;

        // Earlier parsers take priority over later ones at the same position.
        foreach (ConstructorInfo ci in parserConstructors)
        {
            IParser parser = ci.Invoke(new object[] { context }) as IParser;
            try
            {
                ParseResultCollection prc = parser.Parse(position);
                if (prc.Count > 0)
                {
                    foreach (ParseResult pr in prc)
                    {
                        result.Add(pr);
                        i += pr.Length;
                    }
                    isFound = true;
                    break;
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }
        }

        // The advance-on-miss step used to sit inside the foreach above, which moved the position
        // once per failing parser (skipping text and feeding later parsers the wrong offset) and
        // never ran at all for an empty parser list. It now runs once per position, and also
        // guards against matches that consume no characters.
        if (!isFound || i <= position)
        {
            i = Math.Max(i, position) + 1;
        }
    }

    return result;
}
/// <summary>
/// Publisher names embedded in full sentences must be the single A_NT result of the scan.
/// </summary>
public void TestParseFullSentenceForPublisherName()
{
    // Row k: sentence, expected publisher name, expected start position.
    string[] sentences =
    {
        "《软件测试方法与技术实践指南Java EE版》已经由清华大学出版社出版",
        "作为法律出版社新书、重点书发布平台,我们致力于为读者提供优质的信息服务。"
    };
    string[] publishers = { "清华大学出版社", "法律出版社" };
    int[] positions = { 26, 2 };

    for (int k = 0; k < sentences.Length; k++)
    {
        ParseResultCollection found = ParseChineseOrgName(sentences[k]);
        Assert.AreEqual(1, found.Count);
        GeneralParserTest.AssertParseResult(found[0], publishers[k], positions[k], POSType.A_NT);
    }
}
/// <summary>
/// NameParser finds the name "李三" at the start of the sentence and nothing at index 5.
/// </summary>
public void TestParseNotZhangSanCase()
{
    IParser nameParser = new NameParser(GeneralParserTest.CreateParserContext("李三买了一张三角桌子", dictServer));

    // Surname and given name are reported as separate A_NR results.
    ParseResultCollection found = nameParser.Parse(0);
    Assert.AreEqual(2, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "李", 0, POSType.A_NR);
    GeneralParserTest.AssertParseResult(found[1], "三", 1, POSType.A_NR);

    // Index 5 must not yield a name.
    found = nameParser.Parse(5);
    Assert.AreEqual(0, found.Count);
}
/// <summary>
/// Five phone numbers interleaved with department names must all be found at the expected
/// positions.
/// </summary>
public void TestParsePhones()
{
    string directory = "机关党工委 86780445 区政研室 86780455 老干部局 86780474 团区委 86780515 妇 联 86780524";

    ParseResultCollection found = ParsePhoneNo(directory);

    Assert.AreEqual(5, found.Count);

    // Only the first hit has its structured phone value checked.
    TestUtility.AssertParseResult(found[0], "86780445", 7, POSType.A_M);
    AssertPhoneValue(found[0], null, null, "86780445", null);

    string[] numbers = { "86780455", "86780474", "86780515", "86780524" };
    int[] positions = { 22, 37, 52, 67 };
    for (int k = 0; k < numbers.Length; k++)
    {
        TestUtility.AssertParseResult(found[k + 1], numbers[k], positions[k], POSType.A_M);
    }
}
/// <summary>
/// A 6-digit code under the China pattern and a 5-digit code under the NorthAmerica pattern are
/// each reported as one A_M result.
/// </summary>
public void TestParseSinglePostalCode()
{
    // China pattern.
    IParser postalParser = new PostalCodeParser(GeneralParserTest.CreateParserContext("200135,哈哈", ParserPattern.China));
    ParseResultCollection found = postalParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    AssertParseResult(found[0], "200135", 0, POSType.A_M);

    // NorthAmerica pattern.
    postalParser = new PostalCodeParser(GeneralParserTest.CreateParserContext("21201 Baltimore Maryland(MD)", ParserPattern.NorthAmerica));
    found = postalParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    AssertParseResult(found[0], "21201", 0, POSType.A_M);
}
/// <summary>
/// A single-character and a multi-character Chinese numeral, each followed by a measure word,
/// are reported as an A_M / A_Q pair.
/// </summary>
public void TestParseSingleQuantity()
{
    IParser quantityParser = new SimChineseQuantityParser(TestUtility.CreateParserContext("一棵树通常有二十五根树杈。"));

    // Pair at the start of the sentence.
    ParseResultCollection found = quantityParser.Parse(0);
    Assert.AreEqual(2, found.Count);
    TestUtility.AssertParseResult(found[0], "一", 0, POSType.A_M);
    TestUtility.AssertParseResult(found[1], "棵", 1, POSType.A_Q);

    // Pair starting at index 6.
    found = quantityParser.Parse(6);
    Assert.AreEqual(2, found.Count);
    TestUtility.AssertParseResult(found[0], "二十五", 6, POSType.A_M);
    TestUtility.AssertParseResult(found[1], "根", 9, POSType.A_Q);
}
/// <summary>
/// PlaceNameParser reports a place name only when parsing starts exactly on it.
/// </summary>
public void TestParsePlaceName()
{
    IParser placeParser = new PlaceNameParser(GeneralParserTest.CreateParserContext("你好,我在上海,他在北京。", dictAddr));

    // No place name at the start of the sentence.
    ParseResultCollection found = placeParser.Parse(0);
    Assert.AreEqual(0, found.Count);

    found = placeParser.Parse(5);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "上海", 5, POSType.A_NS);

    found = placeParser.Parse(10);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "北京", 10, POSType.A_NS);
}
/// <summary>
/// Organization names at the end of full sentences must be the single A_NT result of the scan.
/// </summary>
public void TestParseFullSentenceForOrgName()
{
    // Row k: sentence, expected organization name, expected start position.
    string[] sentences =
    {
        "郑荣科 07行政管理专业 现任职于上海江正营销策划有限公司",
        "李承龙 08民航 现任职于中国南方国际航空。",
        "李林玲(左) 06旅游管理班 现任职于上海中福大酒店。"
    };
    string[] organizations = { "上海江正营销策划有限公司", "中国南方国际航空", "上海中福大酒店" };
    int[] positions = { 17, 13, 19 };

    for (int k = 0; k < sentences.Length; k++)
    {
        ParseResultCollection found = ParseChineseOrgName(sentences[k]);
        Assert.AreEqual(1, found.Count);
        GeneralParserTest.AssertParseResult(found[0], organizations[k], positions[k], POSType.A_NT);
    }
}
/// <summary>
/// Addresses in which the city name is written twice still produce a single address result.
/// </summary>
public void TestParseSingleAddressWithDuplicateCity()
{
    // Identical city token repeated.
    IParser addressParser = new ChineseAddressParser(CreateParserContext("上海市上海市黄浦区外马路1410号"));
    ParseResultCollection found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "上海市上海市黄浦区外马路1410号", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, "上海市", "黄浦区", "外马路", "1410号", null, null);

    // City written as 上海 and then 上海市: when it appears twice, the first occurrence wins.
    addressParser = new ChineseAddressParser(CreateParserContext("上海上海市黄浦区外马路1410号"));
    found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "上海黄浦区外马路1410号", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, "上海", "黄浦区", "外马路", "1410号", null, null);
}
/// <summary>
/// North American numbers written with a country code or a parenthesized area code are parsed
/// as one A_M result with the expected phone components.
/// </summary>
public void TestParseSingleNorthAmericanPhone()
{
    // "+1-" country code form.
    PhoneNoParser phoneParser = new PhoneNoParser(TestUtility.CreateParserContext("+1-415-555-2374", ParserPattern.NorthAmerica));
    ParseResultCollection found = phoneParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    TestUtility.AssertParseResult(found[0], "+1-415-555-2374", 0, POSType.A_M);
    AssertPhoneValue(found[0], "1", "415", "555-2374", null, true);

    // "(area) number" form.
    phoneParser = new PhoneNoParser(TestUtility.CreateParserContext("(800) 628-1058", ParserPattern.NorthAmerica));
    found = phoneParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    TestUtility.AssertParseResult(found[0], "(800) 628-1058", 0, POSType.A_M);
    AssertPhoneValue(found[0], null, "800", "628-1058", null, true);
}
/// <summary>
/// NameParser splits a personal name into surname and given name, and reports a trailing title
/// as a separate D_N result.
/// </summary>
public void TestParseSingleName()
{
    IParser nameParser = new NameParser(GeneralParserTest.CreateParserContext("你好,我叫毛泽东,这位是朱德同志。", dictServer));

    // Name followed by punctuation.
    ParseResultCollection found = nameParser.Parse(5);
    Assert.AreEqual(2, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "毛", 5, POSType.A_NR);
    GeneralParserTest.AssertParseResult(found[1], "泽东", 6, POSType.A_NR);

    // Name followed by a title.
    found = nameParser.Parse(12);
    Assert.AreEqual(3, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "朱", 12, POSType.A_NR);
    GeneralParserTest.AssertParseResult(found[1], "德", 13, POSType.A_NR);
    GeneralParserTest.AssertParseResult(found[2], "同志", 14, POSType.D_N);
}
/// <summary>
/// Parses a range of complete addresses and checks the reported text plus the individual
/// address fields extracted into the ChineseAddress value.
/// </summary>
public void TestParseSingleAddress()
{
    // City, district, street, house number.
    IParser addressParser = new ChineseAddressParser(CreateParserContext("上海市黄浦区外马路1410号"));
    ParseResultCollection found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "上海市黄浦区外马路1410号", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, "上海市", "黄浦区", "外马路", "1410号", null, null);

    // Lane, house number, room and a parenthesized building remark.
    addressParser = new ChineseAddressParser(CreateParserContext("上海市黄浦区陆家浜路413弄5号702室(金南新苑商务楼)"));
    found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "上海市黄浦区陆家浜路413弄5号702室(金南新苑商务楼)", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, null, "上海市", "黄浦区", "陆家浜路", "5号", null, "702室", null, null, "413弄");

    // Country prefix, building name and floor.
    addressParser = new ChineseAddressParser(CreateParserContext("中国上海市浦东新区银城中路68号时代金融中心大厦38楼"));
    found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "中国上海市浦东新区银城中路68号时代金融中心大厦38楼", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, "中国", null, "上海市", "浦东新区", "银城中路", "68号", "38楼", null, "时代金融中心大厦");

    // No district; floor and room present.
    addressParser = new ChineseAddressParser(CreateParserContext("杭州市体育场路453号14楼302室"));
    found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "杭州市体育场路453号14楼302室", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, "杭州市", null, "体育场路", "453号", "14楼", "302室");

    // House number written with Chinese numerals; label prefix skipped via the offset.
    addressParser = new ChineseAddressParser(CreateParserContext("地址:杭州市江干区九堡九环路六十号"));
    found = addressParser.Parse(3);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "杭州市江干区九堡九环路六十号", 3, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, "杭州市", "江干区", "九堡九环路", "六十号", null, null);

    // City written without the 市 suffix.
    addressParser = new ChineseAddressParser(CreateParserContext("杭州西湖区文二西路2号"));
    found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "杭州西湖区文二西路2号", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, "杭州", "西湖区", "文二西路", "2号", null, null);

    // Street, lane and house number only.
    addressParser = new ChineseAddressParser(CreateParserContext("长乐路460弄10号"));
    found = addressParser.Parse(0);
    Assert.AreEqual(1, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "长乐路460弄10号", 0, POSType.D_S);
    AssertAddressValue((ChineseAddress)found[0].Value, null, null, null, null, "长乐路", "10号", null, null, null, null, "460弄");
}
/// <summary>
/// Relative day words (今天, 明天, 昨天, 前天) are reported as D_T results, alone or mixed with a
/// numeric date. The resolved DateTime of the relative words is not asserted at the moment.
/// </summary>
public void TestChineseNonNumericDateText()
{
    // Single relative word at the start of the sentence.
    ParseResultCollection dates = ParseChineseDateTime("今天是个好日子。");
    Assert.AreEqual(1, dates.Count);
    ParseResult hit = dates[0];
    Assert.AreEqual("今天", hit.Text);
    Assert.AreEqual(2, hit.Length);
    Assert.AreEqual(0, hit.StartPos);
    Assert.AreEqual(POSType.D_T, hit.Type);
    // Midnight of the current day; only referenced by the disabled value checks below.
    DateTime now = new DateTime(DateTime.Now.Year, DateTime.Now.Month, DateTime.Now.Day);
    //Assert.AreEqual(now, (DateTime)hit.Value);

    // Two relative words in one sentence.
    dates = ParseChineseDateTime("你好,明天你会和昨天一样来吗?");
    Assert.AreEqual(2, dates.Count);
    hit = dates[0];
    Assert.AreEqual("明天", hit.Text);
    Assert.AreEqual(2, hit.Length);
    Assert.AreEqual(3, hit.StartPos);
    Assert.AreEqual(POSType.D_T, hit.Type);
    //Assert.AreEqual(now.AddDays(1), (DateTime)hit.Value);
    hit = dates[1];
    Assert.AreEqual("昨天", hit.Text);
    Assert.AreEqual(2, hit.Length);
    Assert.AreEqual(8, hit.StartPos);
    Assert.AreEqual(POSType.D_T, hit.Type);
    //Assert.AreEqual(now.AddDays(-1), (DateTime)hit.Value);

    // Relative word followed by a full numeric date.
    dates = ParseChineseDateTime("前天是2012年1月10日,晴");
    Assert.AreEqual(2, dates.Count);
    hit = dates[0];
    Assert.AreEqual("前天", hit.Text);
    Assert.AreEqual(2, hit.Length);
    Assert.AreEqual(0, hit.StartPos);
    Assert.AreEqual(POSType.D_T, hit.Type);
    hit = dates[1];
    Assert.AreEqual("2012年1月10日", hit.Text);
    Assert.AreEqual(10, hit.Length);
    Assert.AreEqual(3, hit.StartPos);
    Assert.AreEqual(POSType.D_T, hit.Type);
    Assert.AreEqual(new DateTime(2012, 1, 10), (DateTime)hit.Value);
}
/// <summary>
/// A title following a surname (with or without a given name) is reported as a D_N result after
/// the A_NR name parts.
/// </summary>
public void TestParseNameWithSuffix()
{
    // Surname directly followed by the title.
    IParser nameParser = new NameParser(GeneralParserTest.CreateParserContext("王教授给我们授课", dictServer));
    ParseResultCollection found = nameParser.Parse(0);
    Assert.AreEqual(2, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "王", 0, POSType.A_NR);
    GeneralParserTest.AssertParseResult(found[1], "教授", 1, POSType.D_N);

    // Surname, given name, then the title.
    nameParser = new NameParser(GeneralParserTest.CreateParserContext("王琪斌教授给我们授课", dictServer));
    found = nameParser.Parse(0);
    Assert.AreEqual(3, found.Count);
    GeneralParserTest.AssertParseResult(found[0], "王", 0, POSType.A_NR);
    GeneralParserTest.AssertParseResult(found[1], "琪斌", 1, POSType.A_NR);
    GeneralParserTest.AssertParseResult(found[2], "教授", 3, POSType.D_N);
}
/// <summary>
/// Asks <c>MatchPlaceName</c> for a place name beginning at <paramref name="startIndex"/>.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> at which the match is attempted.</param>
/// <returns>
/// A collection with one <c>POSType.A_NS</c> result when a name is matched; otherwise empty.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string source = context.Text;
    ParseResultCollection matches = new ParseResultCollection();

    string matchedName = MatchPlaceName(source, startIndex);
    if (matchedName == null)
    {
        return matches;
    }

    ParseResult hit = new ParseResult
    {
        Text = matchedName,
        StartPos = startIndex,
        Type = POSType.A_NS
    };
    matches.Add(hit);
    return matches;
}
/// <summary>
/// All place names in a full sentence are found, including a repeated name and names separated
/// by enumeration commas.
/// </summary>
public void TestParseFullSentenceForPlaceNames()
{
    // Same city mentioned twice, then a four-character city name.
    ParseResultCollection cities = ParsePlaceName("沈阳的天气真好,如果哪天我能去沈阳玩就好了。呼和浩特那里咋样?");
    Assert.AreEqual(3, cities.Count);
    GeneralParserTest.AssertParseResult(cities[0], "沈阳", 0, POSType.A_NS);
    GeneralParserTest.AssertParseResult(cities[1], "沈阳", 15, POSType.A_NS);
    GeneralParserTest.AssertParseResult(cities[2], "呼和浩特", 22, POSType.A_NS);

    // Four country names listed in one sentence.
    ParseResultCollection countries = ParsePlaceName("传统“金砖四国”(BRIC)引用了巴西、俄罗斯、印度和中国的英文首字母。");
    Assert.AreEqual(4, countries.Count);
    string[] names = { "巴西", "俄罗斯", "印度", "中国" };
    int[] positions = { 17, 20, 24, 27 };
    for (int k = 0; k < names.Length; k++)
    {
        GeneralParserTest.AssertParseResult(countries[k], names[k], positions[k], POSType.A_NS);
    }
}
/// <summary>
/// Tries to match a place name that starts at <paramref name="startIndex"/> using
/// <c>MatchPlaceName</c>.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> at which the match is attempted.</param>
/// <returns>
/// One <c>POSType.A_NS</c> result carrying the matched name, or an empty collection when
/// <c>MatchPlaceName</c> returns null.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string fullText = context.Text;
    ParseResultCollection output = new ParseResultCollection();

    string candidate = MatchPlaceName(fullText, startIndex);
    bool matched = candidate != null;
    if (matched)
    {
        ParseResult place = new ParseResult();
        place.Text = candidate;
        place.StartPos = startIndex;
        place.Type = POSType.A_NS;
        output.Add(place);
    }

    return output;
}
/// <summary>
/// Inputs that are not valid phone numbers under the China pattern must yield no results.
/// </summary>
public void TestParseSingleIllegalPhone()
{
    string[] rejected = { " ", "611212341", "2", "23" };

    foreach (string candidate in rejected)
    {
        PhoneNoParser phoneParser = new PhoneNoParser(TestUtility.CreateParserContext(candidate, ParserPattern.China));
        ParseResultCollection found = phoneParser.Parse(0);
        Assert.AreEqual(0, found.Count);
    }
}
/// <summary>
/// Recognizes a numeral (Arabic digits, Chinese numeral characters or '.') starting at
/// <paramref name="startIndex"/> that is followed by a character accepted by
/// <c>IsChineseQuantity</c>.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> where the numeral should begin.</param>
/// <returns>
/// The numeral (<c>POSType.A_M</c>) and the following character (<c>POSType.A_Q</c>) as two
/// results, or an empty collection when the pattern is not present.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string input = context.Text;
    ParseResultCollection output = new ParseResultCollection();

    int pos = startIndex;
    char ch = input[pos];
    for (;;)
    {
        bool partOfNumber = NumeralUtil.IsArabicNumeral(ch)
                            || NumeralUtil.IsChineseNumeralChars(ch)
                            || ch == '.';
        bool hasFollower = pos + 1 < input.Length;
        if (!(partOfNumber && hasFollower))
        {
            break;
        }

        // Reject a numeral whose first character is flagged by IsChineseGenDigit (other than '十').
        bool atFirstChar = pos == startIndex;
        if (atFirstChar && NumeralUtil.IsChineseGenDigit(ch) && ch != '十')
        {
            return output;
        }

        pos += 1;
        ch = input[pos];
    }

    if (pos != startIndex)
    {
        int measureIndex = Math.Min(pos, input.Length);
        if (IsChineseQuantity(input[measureIndex]))
        {
            int numberLength = pos - startIndex;
            output.Add(ParseResult.Create(input.Substring(startIndex, numberLength), startIndex, POSType.A_M));
            output.Add(ParseResult.Create(input[pos].ToString(), pos, POSType.A_Q));
        }
    }

    return output;
}
/// <summary>
/// Scans the context text from <paramref name="startIndex"/> and assembles a Chinese address by
/// reacting to suffix characters (市, 区, 省, 路, 号, ...) as they are encountered.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> at which scanning begins.</param>
/// <returns>
/// A collection holding at most one <c>POSType.D_S</c> result. Its text is the concatenation of
/// the components that were accepted (so it can differ from the raw substring) and its value is
/// the populated <c>ChineseAddress</c>. The collection is empty when no component was accepted.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();
    // Only the first maxChineseAddressLength characters from startIndex are examined.
    string temp = _text.Substring(startIndex, Math.Min(maxChineseAddressLength, _text.Length - startIndex));
    char[] chars = temp.ToCharArray();
    //int lastStartPos = 0;
    // sb buffers the component currently being read; whole collects the accepted components.
    StringBuilder sb = new StringBuilder();
    StringBuilder whole = new StringBuilder();
    ChineseAddress ca=new ChineseAddress();
    int startpos = 0;
    //TODO: look up country names in a dictionary
    if (temp.StartsWith("中国"))
    {
        startpos = 2;
        ca.country = "中国";
        whole.Append("中国");
    }
    for (int i = startpos; i < chars.Length; i++)
    {
        char ch = chars[i];
        // In most branches below, a suffix character seen while the buffer is empty is simply
        // buffered as ordinary text.
        if (ch == '市'||ch=='场')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }
            sb.Append(ch);
            string subStr = sb.ToString();
            // NOTE(review): the meaning of the (0, 5) arguments is defined by GetMaximumMatch,
            // which is not visible here.
            string city = GetMaximumMatch(subStr,0,5);
            // On a miss the buffer is kept and keeps growing.
            if (city != null)
            {
                ca.city = city;
                whole.Append(ca.city);
                sb = new StringBuilder();
            }
        }
        else if (ch == '区')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }
            sb.Append(ch);
            string subStr = sb.ToString();
            string district = GetMaximumMatch(subStr, 0, 5);
            if (district != null)
            {
                if (!district.EndsWith("区"))
                {
                    // The match does not end in 区: treat it as the city and the rest of the
                    // buffer as the district (assumes the match is a prefix of subStr — confirm).
                    ca.city = district;
                    whole.Append(ca.city);
                    ca.district = subStr.Substring(ca.city.Length);
                    whole.Append(ca.district);
                }
                else
                {
                    //string district = NEParser.GetMaximumMatch(subStr, 0, 5, "district", _cityNames, null);
                    ca.district = district;
                    whole.Append(ca.district);
                }
            }
            else
            {
                // No match: the whole buffer is taken as the district.
                ca.district = subStr;
                whole.Append(ca.district);
            }
            sb = new StringBuilder();
        }
        else if (ch == '省')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }
            sb.Append(ch);
            string subStr = sb.ToString();
            string province = GetMaximumMatch(subStr, 0, 5); // province
            if (province != null)
            {
                ca.province = province;
                whole.Append(ca.province);
                sb = new StringBuilder();
            }
        }
        else if (ch == '乡' || ch == '村' || ch == '县' || ch == '镇')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }
            // Accepted without any lookup.
            sb.Append(ch);
            ca.county = sb.ToString();
            whole.Append(ca.county);
            sb = new StringBuilder();
        }
        else if (ch == '巷')
        {
            // '巷' is neither buffered nor emitted.
        }
        else if (ch == '楼'||ch == '弄'||ch == '号'||ch ==
            '室')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }
            // The text before the suffix must be an integer (after Chinese numerals are
            // converted) to count as a floor / lane / number / room. Otherwise the buffer, now
            // including the suffix, keeps growing.
            string substr = NumeralUtil.ConvertChineseNumeral2Arabic(sb.ToString());
            int x;
            sb.Append(ch);
            if (Int32.TryParse(substr, out x))
            {
                if (ch == '楼')
                    ca.floor = sb.ToString();
                else if (ch == '弄')
                    ca.lane = sb.ToString();
                else if (ch == '号')
                    ca.no = sb.ToString();
                else if (ch == '室')
                    ca.room = sb.ToString();
                whole.Append(sb.ToString());
                sb = new StringBuilder();
            }
        }
        else if (ch == '道' || ch == '路' || ch == '街')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }
            // Accepted as the street without any lookup.
            sb.Append(ch);
            ca.street = sb.ToString();
            whole.Append(ca.street);
            sb = new StringBuilder();
        }
        else if (ch == '(' || ch == '(')
        {
            // An opening parenthesis discards whatever was buffered and starts a new buffer.
            sb = new StringBuilder();
            sb.Append(ch);
        }
        else if (ch == ')' || ch == ')')
        {
            // The buffered text up to and including the closing parenthesis becomes `extra`.
            sb.Append(ch);
            string extra1 = sb.ToString();
            whole.Append(extra1);
            ca.extra = extra1;
            sb = new StringBuilder();
        }
        else if (CharacterUtil.IsChinesePunctuation(ch) || (ch == ' ' || ch == ' '))
        {
            // Punctuation or a space ends the scan.
            break;
        }
        else if (ch == '大')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }
            // Look ahead for 大桥 / 大厦 / 大酒店. When none follows, '大' is not added to the buffer.
            if (i + 1 < chars.Length)
            {
                char nextchar = chars[i + 1];
                if (nextchar == '桥' || nextchar == '厦')
                {
                    string extra1 = sb.ToString() + "大" + nextchar;
                    whole.Append(extra1);
                    if (nextchar == '桥')
                        ca.extra += extra1;
                    else
                        ca.building = extra1;
                    // Skip the look-ahead character; the loop's i++ accounts for '大' itself.
                    i += 2-1;
                    sb = new StringBuilder();
                }
                else if (i + 2 < chars.Length && nextchar == '酒')
                {
                    char nextchar2 = chars[i + 2];
                    if (nextchar2 == '店')
                    {
                        string extra1 = sb.ToString() + "大" + nextchar+ nextchar2;
                        string city = GetMaximumMatch(extra1, 0, 5); // city or province
                        if (city != null)
                        {
                            // Split a leading city off the hotel name.
                            ca.city = city;
                            whole.Append(ca.city);
                            extra1 = extra1.Substring(ca.city.Length);
                        }
                        whole.Append(extra1);
                        ca.building= extra1;
                        // Skip both look-ahead characters.
                        i += 3-1;
                        sb = new StringBuilder();
                    }
                }
            }
        }
        else if(ch=='餐')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }
            // Look ahead for 餐厅. When it does not follow, '餐' is not added to the buffer.
            if (i + 1 < chars.Length)
            {
                char nextchar = chars[i + 1];
                if (nextchar == '厅')
                {
                    string extra1 = sb.ToString() + "餐" + nextchar;
                    whole.Append(extra1);
                    ca.extra += extra1;
                    i += 2 - 1;
                    sb = new StringBuilder();
                }
            }
        }
        else
        {
            //if (sb.Length == 0)
            //    lastStartPos = i;
            // Ordinary character: buffer it, then check whether the buffer now ends a building name.
            sb.Append(ch);
            string extra = sb.ToString();
            if (extra.EndsWith("中心") || extra.EndsWith("酒店"))
            {
                string city = GetMaximumMatch(extra, 0, 5); // city
                if (city != null)
                {
                    // Note: unlike the other branches, the city is not appended to `whole` here.
                    ca.city = city;
                    extra = extra.Substring(city.Length);
                }
                ca.building = extra;
                whole.Append(extra);
                if (i + 2 < chars.Length && chars[i + 1] == '大' && chars[i + 2] == '厦') // handle "中心大厦"
                {
                    ca.building += "大厦";
                    whole.Append("大厦");
                    i += 2;
                    sb = new StringBuilder();
                    continue;
                }
                sb = new StringBuilder();
            }
        }
    }
    if ( whole.Length>0)
    {
        // Leftover buffered text replaces any previously stored `extra`; it is not appended to
        // the reported text.
        if(sb.Length>0)
            ca.extra= sb.ToString();
        prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.D_S,ca));
    }
    return prc;
}
/// <summary>
/// Matches the text from <paramref name="startIndex"/> onward against <c>addressRegex</c> and
/// wraps a successful match in an <c>Address</c>.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> at which the candidate address begins.</param>
/// <returns>
/// A collection with one <c>POSType.A_M</c> result whose text is <c>Address.ToString()</c> and
/// whose value is the <c>Address</c>; empty when the remaining text is empty or does not match.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The context pattern is not <c>ParserPattern.NorthAmerica</c>.
/// </exception>
public ParseResultCollection Parse(int startIndex)
{
    ParseResultCollection results = new ParseResultCollection();
    string remainder = context.Text.Substring(startIndex);

    if (context.Pattern != ParserPattern.NorthAmerica)
    {
        throw new InvalidOperationException("To use USAddressParser, Parser pattern must be NorthAmerica");
    }

    if (string.IsNullOrEmpty(remainder))
    {
        return results;
    }

    // The regex is applied to the upper-cased text.
    var candidate = addressRegex.Match(remainder.ToUpperInvariant());
    if (!candidate.Success)
    {
        return results;
    }

    var fields = GetApplicableFields(candidate);
    var address = new Address(Normalize(fields));
    results.Add(ParseResult.Create(address.ToString(), startIndex, POSType.A_M, address));
    return results;
}
/// <summary>
/// Recognizes a Chinese date/time expression starting at <paramref name="startIndex"/>: either a
/// relative word (今天, 明天, 去年, 上周, ...) or a numeric expression built from digits and the unit
/// characters 年 月 日 点 分 秒.
/// </summary>
/// <param name="startIndex">
/// Index at which scanning begins. It is applied to the context text after Chinese numerals have
/// been converted by <c>NumeralUtil.ConvertChineseNumeral2Arabic</c>.
/// </param>
/// <returns>
/// At most one <c>POSType.D_T</c> result. Relative words are returned without a value; numeric
/// expressions carry the <c>DateTime</c> produced by <c>DateUtil.ParseDate</c>. The collection is
/// empty when nothing is recognized or the date cannot be parsed.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string text = NumeralUtil.ConvertChineseNumeral2Arabic(context.Text);
    ParseResultCollection prc = new ParseResultCollection();
    // At most maxDateTimeTextLength characters are examined.
    int boundary = Math.Min(maxDateTimeTextLength, text.Length - startIndex);
    string temp = text.Substring(startIndex, boundary);

    StringBuilder sbDateText = new StringBuilder();     // digits + unit characters, no spaces
    StringBuilder sbPatternText = new StringBuilder();  // matching format pattern for ParseDate
    StringBuilder sbText = new StringBuilder();         // text reported in the result
    int strLen = 0;                                     // digits seen since the last unit character
    int i;
    char prevCh = ' ';
    bool nonNumeric = false;

    for (i = 0; i < boundary; i++)
    {
        char ch = temp[i];
        if (NumeralUtil.IsArabicNumeral(ch))
        {
            sbDateText.Append(ch);
            sbText.Append(ch);
            strLen++;
        }
        else if (ch == '上')
        {
            // '上' used to fall into the final `break`, which made the "上周" case below
            // unreachable. It is let through only when '周' follows immediately, so text such as
            // "晚上8点" still stops here and is parsed from the digit position instead.
            if (i + 1 >= boundary || temp[i + 1] != '周')
            {
                break;
            }
        }
        else if (ch == '大' || ch == '前' || ch == '昨' || ch == '明' || ch == '今' || ch == '后' || ch == '去')
        {
            // First character of a possible relative word; decided by the next character.
        }
        else if (ch == '周')
        {
            if (prevCh == '上')
            {
                nonNumeric = true;
                sbText.Append(prevCh);
                sbText.Append(ch);
                break;
            }
        }
        else if (ch == '天')
        {
            if (prevCh == '前' || prevCh == '昨' || prevCh == '明' || prevCh == '今' || prevCh == '后')
            {
                nonNumeric = true;
                sbText.Append(prevCh);
                sbText.Append(ch);
                break;
            }
        }
        else if (ch == '年' || ch == '月' || ch == '日' || ch == '点' || ch == '分' || ch == '秒')
        {
            // Relative year (去年, 前年, 今年, 后年).
            if (ch == '年' && (prevCh == '去' || prevCh == '前' || prevCh == '今' || prevCh == '后'))
            {
                nonNumeric = true;
                sbText.Append(prevCh);
                sbText.Append(ch);
                break;
            }

            // A unit character must be preceded by at least one digit.
            if (strLen == 0)
            {
                return prc;
            }

            char patternChar;
            switch (ch)
            {
                case '年': patternChar = 'y'; break;
                case '月': patternChar = 'M'; break;
                case '日': patternChar = 'd'; break;
                case '点': patternChar = 'h'; break;
                case '分': patternChar = 'm'; break;
                default: patternChar = 's'; break; // '秒'
            }

            sbDateText.Append(ch);
            sbPatternText.Append(DateUtil.GeneratePatternText(patternChar, strLen));
            sbPatternText.Append(ch);
            sbText.Append(ch);
            strLen = 0;
        }
        else if (ch == ' ')
        {
            // Spaces are kept in the reported text but do not take part in date parsing and do
            // not update prevCh.
            sbText.Append(ch);
            continue;
        }
        else
        {
            break;
        }
        prevCh = ch;
    }

    if (sbText.Length > 0 && nonNumeric)
    {
        prc.Add(ParseResult.Create(sbText.ToString(), startIndex, POSType.D_T));
        return prc;
    }

    if (sbDateText.Length == 0 || sbPatternText.Length == 0)
    {
        return prc;
    }

    DateTime? dt = DateUtil.ParseDate(sbDateText.ToString(), sbPatternText.ToString());
    if (dt != null)
    {
        string result = sbText.ToString();
        prc.Add(ParseResult.Create(result, startIndex, POSType.D_T, dt));
    }
    return prc;
}
/// <summary>
/// Recognizes a one- or two-character pronoun beginning at <paramref name="startIndex"/>.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> of the character to inspect.</param>
/// <returns>
/// A collection with one <c>POSType.D_R</c> result when the character (optionally together with
/// the next one) forms a recognized pronoun; otherwise an empty collection.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string text = context.Text;
    char[] chars = text.ToArray();
    ParseResultCollection results = new ParseResultCollection();

    char head = chars[startIndex];
    bool hasNext = startIndex + 1 < text.Length;
    string pronoun = string.Empty;

    switch (head)
    {
        // Demonstratives, optionally extended by 些 / 里 / 儿.
        case '这':
        case '那':
            pronoun = head.ToString();
            if (hasNext)
            {
                char follower = chars[startIndex + 1];
                if (follower == '些' || follower == '里' || follower == '儿')
                {
                    pronoun += follower;
                }
            }
            break;

        // Personal pronouns, optionally extended by 们.
        case '你':
        case '我':
        case '他':
        case '它':
        case '她':
        case '咱':
            pronoun = head.ToString();
            if (hasNext && chars[startIndex + 1] == '们')
            {
                pronoun += '们';
            }
            break;

        // Pronouns that always stand alone.
        case '谁':
        case '朕':
        case '此':
        case '彼':
            pronoun = head.ToString();
            break;

        // Two-character pronouns that only count when both characters are present.
        case '大':
            if (hasNext && chars[startIndex + 1] == '家')
            {
                pronoun = "大家";
            }
            break;
        case '什':
            if (hasNext && chars[startIndex + 1] == '么')
            {
                pronoun = "什么";
            }
            break;
        case '自':
            if (hasNext && chars[startIndex + 1] == '己')
            {
                pronoun = "自己";
            }
            break;

        // 哪, optionally extended by 里.
        case '哪':
            pronoun = head.ToString();
            if (hasNext && chars[startIndex + 1] == '里')
            {
                pronoun += '里';
            }
            break;
    }

    if (pronoun.Length > 0)
    {
        ParseResult hit = new ParseResult
        {
            StartPos = startIndex,
            Text = pronoun,
            Type = POSType.D_R
        };
        results.Add(hit);
    }

    return results;
}
/// <summary>
/// Tries to recognise a Chinese personal name in <c>context.Text</c> starting at
/// <paramref name="startIndex"/>.
/// </summary>
/// <remarks>
/// An optional prefix (MatchPrefix) and a surname (MatchSurname) are matched first; when a prefix
/// is present the prefix and surname are emitted immediately. The method then looks for evidence
/// that a name follows:
/// <list type="number">
/// <item>Chinese punctuation directly after the surname: stop, returning whatever was emitted.</item>
/// <item>A suffix found by MatchSuffix: emit surname, optional given name and the suffix, then return.</item>
/// <item>A verb found by MatchVerb beyond the surname: the text in between becomes the given name.</item>
/// <item>A punctuation position reported by MatchPunctation: the text before it becomes the given name.</item>
/// <item>The text ending within MaximumGivennameLength characters after the surname: the rest becomes the given name.</item>
/// </list>
/// </remarks>
/// <param name="startIndex">Index in <c>context.Text</c> at which matching starts.</param>
/// <returns>
/// Results tagged <c>POSType.D_N</c> (prefix / suffix) and <c>POSType.A_NR</c> (surname / given name);
/// an empty collection when no surname is found.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();

    // TODO: handle Chinese renderings of foreign names (they have no surname).

    // Look for a prefix in front of the surname.
    string prefix = MatchPrefix(_text, startIndex);
    int prefixLength = (prefix != null) ? prefix.Length : 0;

    // Look up the surname right after the prefix; without a surname there is nothing to report.
    int currentPos = startIndex + prefixLength;
    string surname = MatchSurname(_text, currentPos);
    if (surname == null)
    {
        return prc;
    }

    bool surnameInserted = false;
    bool givennameInserted = false;
    if (prefix != null)
    {
        prc.Add(ParseResult.Create(prefix, startIndex, POSType.D_N));
        prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
        surnameInserted = true;
        currentPos += surname.Length;
    }

    // Index of the first character after the surname. Once the surname has been emitted,
    // currentPos already points past it, so the surname length must not be added a second time.
    int afterSurname = currentPos + (surnameInserted ? 0 : surname.Length);

    // If the surname is directly followed by Chinese punctuation, do not treat it as a name.
    // (Previously this looked at currentPos + 1, which is only the right character for a
    // one-character surname without a prefix.)
    if (afterSurname < _text.Length && CharacterUtil.IsChinesePunctuation(_text[afterSurname]))
    {
        return prc;
    }

    // Disabled: maximum-match lookup of the complete name in the dictionary; a high-weight hit
    // would be accepted as a name directly.
    //string fullname = MatchFullname(_text, startIndex);
    //if (fullname != null)
    //{
    //    prc.Add(ParseResult.Create(surname, startIndex, POSType.A_NR));
    //    prc.Add(ParseResult.Create(fullname.Substring(surname.Length), startIndex + surname.Length, POSType.A_NR));
    //    return prc;
    //}

    // Disabled: given-name lookup.
    // TODO: narrow the range of given names first, otherwise false matches are likely.
    //string givenname = MatchGivenname(_text, startIndex + surname.Length);
    //if (givenname != null)
    //{
    //    string suffix2 = MatchSuffix(_text, startIndex + surname.Length + givenname.Length, _siblingWordDB);
    //    if (suffix != null && givenname.Length <= suffix.Length)
    //    {
    //        givenname = null;
    //    }
    //    else
    //    {
    //        suffix = suffix2;
    //    }
    //}

    // A following form of address (the original note lists Mr., Miss, Dr. and doctor as examples)
    // confirms the name.
    int resultStartPos;
    string suffix = MatchSuffix(_text, afterSurname, out resultStartPos);
    if (suffix != null)
    {
        if (!surnameInserted)
        {
            prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
            surnameInserted = true;
            currentPos += surname.Length;
        }

        if (resultStartPos > currentPos)
        {
            // Characters between the surname and the suffix form the given name.
            string givenname = _text.Substring(currentPos, resultStartPos - currentPos);
            prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR));
            prc.Add(ParseResult.Create(suffix, resultStartPos, POSType.D_N));
        }
        else
        {
            prc.Add(ParseResult.Create(suffix, currentPos, POSType.D_N));
        }

        return prc;
    }

    // A verb (or causative verb) found beyond the surname: the text in between is the given name.
    bool verbFound = MatchVerb(_text, afterSurname, out resultStartPos);
    if (verbFound && resultStartPos > afterSurname)
    {
        if (!surnameInserted)
        {
            prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
            surnameInserted = true;
            currentPos += surname.Length;
        }

        if (!givennameInserted)
        {
            string givenname = _text.Substring(currentPos, resultStartPos - currentPos);
            prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR));
            currentPos += givenname.Length;
            givennameInserted = true;
        }
    }

    // Punctuation right after the name: the text before it is the given name.
    // The search position is recomputed because the verb rule may have advanced currentPos.
    int punctuationPos = MatchPunctation(_text, currentPos + (surnameInserted ? 0 : surname.Length), 4);
    if (punctuationPos > 0)
    {
        if (!surnameInserted)
        {
            prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
            surnameInserted = true;
            currentPos += surname.Length;
        }

        if (!givennameInserted)
        {
            string givenname = _text.Substring(currentPos, punctuationPos - currentPos);
            prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR));
            currentPos += givenname.Length;
            givennameInserted = true;
        }
    }

    // Nothing follows the name: the text ends within MaximumGivennameLength characters after the
    // surname, so the remainder is taken as the given name. The remainder is measured from the
    // end of the surname; the old expression subtracted surname.Length even when currentPos had
    // already been moved past the surname.
    int remaining = _text.Length - (currentPos + (surnameInserted ? 0 : surname.Length));
    if (remaining > 0 && remaining <= MaximumGivennameLength)
    {
        if (!surnameInserted)
        {
            prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
            surnameInserted = true;
            currentPos += surname.Length;
        }

        if (!givennameInserted)
        {
            string givenname = _text.Substring(currentPos, _text.Length - currentPos);
            prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR));
            currentPos += givenname.Length;
            givennameInserted = true;
        }
    }

    return prc;
}
/// <summary>
/// Tries to recognise a phone number in <c>context.Text</c> starting at <paramref name="startIndex"/>,
/// using the layout rules selected by <c>context.Pattern</c>.
/// </summary>
/// <remarks>
/// The method first collects the run of characters accepted by IsAllowedChar, folding full-width
/// digits, spaces, brackets and dashes to their ASCII forms, and drops everything from an
/// unclosed "(" onwards. The collected text is then split into country code, area code, main
/// number and extension.
/// </remarks>
/// <param name="startIndex">Index in <c>context.Text</c> at which matching starts.</param>
/// <returns>
/// A collection with one <c>POSType.A_M</c> result carrying a <c>PhoneNo</c> value when a main
/// number was recognised; otherwise an empty collection.
/// </returns>
/// <exception cref="NotImplementedException">
/// The pattern is neither <c>ParserPattern.China</c> nor <c>ParserPattern.NorthAmerica</c>
/// (only reached when the collected text passes the length check).
/// </exception>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParserPattern _pattern = context.Pattern;
    int k = startIndex;
    char ch;
    StringBuilder sb = new StringBuilder(10);
    ParseResultCollection prc = new ParseResultCollection();

    // A phone number never starts with a space (ASCII or ideographic U+3000).
    if (_text[startIndex] == ' ' || _text[startIndex] == '\u3000')
    {
        return prc;
    }

    // Index inside sb of the last "(" that has not been closed yet, or -1.
    int braceStartPos = -1;
    while (k < _text.Length)
    {
        ch = _text[k];
        if (!IsAllowedChar(ch, _pattern))
        {
            break;
        }

        // Fold full-width forms to ASCII. The code points are written as escapes so the mapping
        // cannot silently turn into a no-op if the file's encoding is normalised.
        if (ch >= '\uFF10' && ch <= '\uFF19')
        {
            ch = (char)(ch - '\uFF10' + '0');      // full-width digits
        }

        if (ch == '\u3000')
        {
            ch = ' ';                              // ideographic space
        }
        else if (ch == '\uFF08')
        {
            ch = '(';                              // full-width left parenthesis
        }
        else if (ch == '\uFF09')
        {
            ch = ')';                              // full-width right parenthesis
        }
        else if (ch == '\uFF0D' || ch == '\u2014')
        {
            ch = '-';                              // full-width hyphen-minus, em dash
        }

        if (ch == '(')
        {
            // Record the position relative to sb, not to _text: it is used below to cut the
            // collected string, which starts at startIndex rather than at 0.
            braceStartPos = sb.Length;
        }
        else if (ch == ')')
        {
            braceStartPos = -1;
        }

        sb.Append(ch);
        k++;
    }

    string allowedString = sb.ToString().TrimEnd();
    if (braceStartPos >= 0)
    {
        // Drop an unclosed "(" and everything after it.
        allowedString = allowedString.Substring(0, braceStartPos);
    }

    if (allowedString.Length < 3 || allowedString.Length == 4)
    {
        return prc;
    }

    bool bNumberInBrace = false;
    bool bCountryCodeStarted = false;
    bool bAreaCodeStarted = false;
    bool bExtStarted = false;
    int i = 0;
    StringBuilder segment = new StringBuilder();   // digits of the part currently being read
    StringBuilder whole = new StringBuilder();     // normalised text of the whole number
    PhoneNo phone = new PhoneNo();

    if (_pattern == ParserPattern.China)
    {
        while (i < allowedString.Length)
        {
            ch = allowedString[i];
            if (ch == '(')
            {
                bNumberInBrace = true;
                bCountryCodeStarted = false;
                whole.Append(ch);
            }
            else if (NumeralUtil.IsArabicNumeral(ch))
            {
                // The first digit run is treated as an area code unless a country code is being
                // read or an area code is already known.
                if (segment.Length == 0 && !bAreaCodeStarted && phone.AreaCode == null && !bCountryCodeStarted)
                {
                    bAreaCodeStarted = true;
                }

                segment.Append(ch);
                whole.Append(ch);
            }
            else if (ch == ')' && bNumberInBrace)
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }

                    bCountryCodeStarted = false;
                }

                if (bAreaCodeStarted)
                {
                    // City codes starting with 0 have at most 4 digits; others have at most 3.
                    if (segment.Length > 0 && (segment[0] == '0' ? segment.Length <= 4 : segment.Length <= 3))
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }

                whole.Append(ch);
                segment = new StringBuilder();
                bNumberInBrace = false;
            }
            else if (ch == ' ')
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }

                    bCountryCodeStarted = false;
                }
                else if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                }

                segment = new StringBuilder();
                bCountryCodeStarted = false;
                whole.Append(ch);
            }
            else if (ch == '-' || ch == '#')
            {
                // The segment is empty when the separator follows ")" or a space, or starts the
                // text; reading segment[0] in that state would throw.
                if (segment.Length > 0)
                {
                    if (segment[0] == '0' && (segment.Length == 3 || segment.Length == 4))
                    {
                        phone.AreaCode = segment.ToString();
                    }
                    else
                    {
                        AssignPhoneMain(segment, phone);
                        bExtStarted = true;
                    }
                }

                segment = new StringBuilder();
                whole.Append(ch);
            }
            else if (ch == '+')
            {
                whole.Append(ch);
                bCountryCodeStarted = true;
            }

            i++;
        }
    }
    else if (_pattern == ParserPattern.NorthAmerica)
    {
        while (i < allowedString.Length)
        {
            ch = allowedString[i];
            if (NumeralUtil.IsArabicNumeral(ch))
            {
                whole.Append(ch);
                segment.Append(ch);
            }
            else if (ch == ' ')
            {
                whole.Append(ch);
            }
            else if (ch == '(')
            {
                bAreaCodeStarted = true;
                whole.Append(ch);
            }
            else if (ch == ')')
            {
                if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }

                segment = new StringBuilder();
                whole.Append(ch);
            }
            else if (ch == '-')
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }

                    bCountryCodeStarted = false;
                    bAreaCodeStarted = true;
                }
                else if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                }

                whole.Append(ch);
                segment = new StringBuilder();
            }
            else if (ch == '+')
            {
                bCountryCodeStarted = true;
                whole.Append(ch);
            }
            else if (ch == '.')
            {
                // A dot is only accepted as the end of the literal "ext."; otherwise stop scanning.
                if (segment.ToString() != "ext")
                {
                    break;
                }

                whole.Append("ext.");
            }
            else if (ch == 'e' || ch == 'x' || ch == 't')
            {
                segment.Append(ch);
            }

            i++;
        }
    }
    else
    {
        throw new NotImplementedException("Phone No. in "+_pattern.ToString()+" is not implemented in the parser.");
    }

    // Flush the part that was still being read when the text ended (same for both patterns).
    if (segment.Length > 0)
    {
        AssignPhoneMain(segment, phone);
        if (bExtStarted)
        {
            phone.Extension = segment.ToString();
            bExtStarted = false;
        }
    }

    if (whole.Length > 0 && phone.Main != null)
    {
        prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.A_M, phone));
    }

    return prc;
}
//public static ParseResultCollection Parse(string text)
//{
//    return ParseResultCollection.InternalParse(text, new OrgNameParser(text));
//}

/// <summary>
/// Tries to recognise an organisation name in <c>context.Text</c> starting at
/// <paramref name="startIndex"/>.
/// </summary>
/// <remarks>
/// Only the next <c>maxChineseOrgNameLength</c> characters are examined. The first entry of
/// <c>suffixList</c> that occurs in that window (not at its very start) ends the candidate name.
/// The candidate is accepted when one of the following holds:
/// <list type="number">
/// <item>PlaceNameParser finds a place name at the start of the window and fewer than
/// <c>maxMiddlePartLength</c> characters separate it from the suffix.</item>
/// <item>The candidate contains an opening bracket and PlaceNameParser finds a place name right after it.</item>
/// <item>There is no such bracket and MatchOrgName finds a name at the start of the window.</item>
/// </list>
/// </remarks>
/// <param name="startIndex">Index in <c>context.Text</c> at which matching starts.</param>
/// <returns>
/// A collection with one <c>POSType.A_NT</c> result when a name is recognised; otherwise an empty collection.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();
    string temp = _text.Substring(startIndex, Math.Min(maxChineseOrgNameLength, _text.Length - startIndex));

    // Take the first suffix in list order that occurs after the first character of the window.
    // Ordinal comparison: the suffixes are fixed strings, not culture-dependent text.
    int pos = -1;
    string suffix = null;
    for (int i = 0; i < suffixList.Length; i++)
    {
        pos = temp.IndexOf(suffixList[i], StringComparison.Ordinal);
        if (pos > 0)
        {
            suffix = suffixList[i];
            break;
        }
    }

    if (suffix == null)
    {
        // No suffix found: nothing to report.
        return prc;
    }

    // Look for a place name at the beginning of the window.
    string placeName = null;
    ParserContext context1 = this.context.Clone();
    context1.Text = temp;
    IParser placeNameParser = new PlaceNameParser(context1);
    ParseResultCollection prc1 = placeNameParser.Parse(0);
    if (prc1.Count > 0)
    {
        placeName = (string)prc1[0].Text;
    }

    // Look for an opening bracket (ASCII or full-width) inside the candidate, i.e. before the
    // suffix. Searching the whole document, as was done before, let a bracket that has nothing to
    // do with this candidate select the bracket branch and suppress the dictionary lookup.
    int bracePos = temp.IndexOfAny(new char[] { '(', '\uFF08' }, 0, pos);

    if (placeName != null && pos - placeName.Length < maxMiddlePartLength)
    {
        prc.Add(ParseResult.Create(temp.Substring(0, pos + suffix.Length), startIndex, POSType.A_NT));
    }
    else if (bracePos > 0)
    {
        // The place name may sit in brackets inside the organisation name.
        IParser placeNameParser2 = new PlaceNameParser(context);
        ParseResultCollection prc2 = placeNameParser2.Parse(startIndex + bracePos + 1);
        if (prc2.Count > 0)
        {
            prc.Add(ParseResult.Create(temp.Substring(0, pos + suffix.Length), startIndex, POSType.A_NT));
        }
    }
    else
    {
        // No place name found: fall back to the organisation-name lookup.
        string orgName = MatchOrgName(temp, 0);
        if (orgName != null)
        {
            prc.Add(ParseResult.Create(orgName, startIndex, POSType.A_NT));
        }
        else
        {
            // Not in the dictionary; boundaries would have to be located with predicates (not implemented).
        }
    }

    return prc;

    /*
     * Notes from "Modern Chinese Lexicon Research - Chinese Information Processing"
     * (《现代汉语词汇研究-中文信息处理》).
     *
     * Confirmation rules
     *   a. If the word before the candidate place-name string is a place-name boundary word and
     *      the word after it is a place-name feature word, both boundaries of the candidate are fixed.
     *   b. If the word before the candidate is a place-name boundary word, its left boundary is fixed.
     *   c. If the word after the candidate is a place-name boundary word, its right boundary is fixed.
     *   d. If two candidates are in a coordinate relation and one of them is confirmed, the other
     *      is confirmed as well.
     *
     * Rejection rules
     *   - Title word: if the word before the candidate is a personal title word and the candidate
     *     contains no place-name feature word, reject the candidate.
     *   - Boundary word: if the word after the candidate is a personal-name boundary word and the
     *     candidate contains no place-name feature word, reject the candidate.
     *   - Coordination: if two candidates are in a coordinate relation and one of them is
     *     rejected, the other is rejected as well.
     *   - Other object class: if the word after the candidate is a feature word of another object
     *     class, reject the candidate (example: 红塔山香烟, "Hongtashan cigarettes").
     *   - Non-single-character word: if the word before the candidate is not a single-character
     *     word, or the word after it is not a single-character word, reject the candidate.
     *
     * Boundary correction rules
     *   - Title word and feature word: if the word before the candidate is a personal title word
     *     and the candidate contains a place-name feature word, correct the place-name boundary.
     */
}