/// <summary>
/// Scans <paramref name="text"/> from left to right and collects every result
/// produced by the registered parsers.
/// </summary>
/// <param name="text">The text to analyse.</param>
/// <param name="pattern">Pattern stored on the parser context handed to every parser.</param>
/// <returns>All parse results, in the order in which they were found.</returns>
/// <remarks>
/// At each position the parsers are tried in the order of <c>parserConstructors</c>;
/// the first one that returns a non-empty collection wins and the scan resumes after
/// the text it consumed. Exceptions thrown by a parser are written to the console
/// and treated as "no match" (best effort).
/// </remarks>
public ParseResultCollection Recognize(string text, ParserPattern pattern)
{
    ParserContext context = new ParserContext();
    context.Pattern = pattern;
    context.Text = text;

    ParseResultCollection result = new ParseResultCollection();

    int i = 0;
    while (i < text.Length)
    {
        // Punctuation never starts an entity; step over it.
        if (CharacterUtil.IsChinesePunctuation(text[i]))
        {
            i++;
            continue;
        }

        // Number of characters consumed by the first parser that matches at i.
        int consumed = 0;

        // Original note: place names are scanned before person names so that
        // incorrect person names can be ruled out (i.e. constructor order matters).
        foreach (ConstructorInfo ci in parserConstructors)
        {
            IParser parser = ci.Invoke(new object[] { context }) as IParser;
            try
            {
                ParseResultCollection prc = parser.Parse(i);
                if (prc.Count > 0)
                {
                    foreach (ParseResult pr in prc)
                    {
                        result.Add(pr);
                        consumed += pr.Length;
                    }
                    break;
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failing parser must not abort the whole scan.
                Console.WriteLine(ex);
            }
        }

        // Advance exactly once per position, and only after every parser has had
        // its chance at that same position. Always move forward by at least one
        // character so that an empty parser list or a zero-length match cannot
        // stall the loop.
        i += consumed > 0 ? consumed : 1;
    }

    return result;
}
/// <summary>
/// Looks for the first Chinese punctuation mark in the window that starts one
/// character after <paramref name="startIndex"/> and ends at
/// <paramref name="startIndex"/> + <paramref name="maxlength"/> (inclusive).
/// </summary>
/// <param name="text">The text to inspect.</param>
/// <param name="startIndex">Index just before the first probed character.</param>
/// <param name="maxlength">Number of positions to probe.</param>
/// <returns>The index of the punctuation mark, or -1 when none is found inside the text.</returns>
int MatchPunctation(string text, int startIndex, int maxlength)
{
    int limit = startIndex + maxlength;
    for (int cursor = startIndex; cursor < limit; cursor++)
    {
        // The character at startIndex itself is never examined; probing starts one past it.
        int probe = cursor + 1;
        if (probe >= text.Length)
        {
            continue;
        }

        if (CharacterUtil.IsChinesePunctuation(text[probe]))
        {
            return probe;
        }
    }

    return -1;
}
/// <summary>
/// Tries to recognise a Chinese personal name that starts at
/// <paramref name="startIndex"/> in the context text.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> at which to start matching.</param>
/// <returns>
/// The recognised parts in text order: optional prefix (<c>POSType.D_N</c>), surname and
/// given name (<c>POSType.A_NR</c>), optional suffix (<c>POSType.D_N</c>). The collection
/// is empty when no surname matches. When a prefix matched, prefix and surname are
/// reported even if no given name could be established.
/// </returns>
/// <remarks>
/// After prefix and surname, the given name is delimited by the first rule that fires:
/// a following suffix (title), a following verb, a punctuation mark within the next
/// four characters, or the end of the text within <c>MaximumGivennameLength</c>.
/// </remarks>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();

    // TODO: handle Chinese renderings of foreign names, which have no surname.

    // 1. An optional prefix may precede the surname.
    string prefix = MatchPrefix(_text, startIndex);
    int surnamePos = startIndex;
    if (prefix != null)
    {
        surnamePos += prefix.Length;
    }

    // 2. A surname is mandatory; without one this is not a name.
    string surname = MatchSurname(_text, surnamePos);
    if (surname == null)
    {
        return prc;
    }

    // First character after the surname. Every rule below is measured from here,
    // regardless of whether a prefix was matched.
    int givennamePos = surnamePos + surname.Length;

    // A prefix followed by a surname is reported immediately.
    bool surnameInserted = false;
    if (prefix != null)
    {
        prc.Add(ParseResult.Create(prefix, startIndex, POSType.D_N));
        prc.Add(ParseResult.Create(surname, surnamePos, POSType.A_NR));
        surnameInserted = true;
    }

    // 3. Punctuation directly after the surname: do not look for a given name.
    if (givennamePos < _text.Length && CharacterUtil.IsChinesePunctuation(_text[givennamePos]))
    {
        return prc;
    }

    // TODO: a full-name dictionary lookup and a dedicated given-name matcher were
    // sketched here but are disabled; the given-name range has to be narrowed first
    // to avoid false matches.

    // 4. A suffix (title such as Mr., Miss, Dr.) after the name confirms it.
    int suffixPos;
    string suffix = MatchSuffix(_text, givennamePos, out suffixPos);
    if (suffix != null)
    {
        if (!surnameInserted)
        {
            prc.Add(ParseResult.Create(surname, surnamePos, POSType.A_NR));
        }

        if (suffixPos > givennamePos)
        {
            // Everything between surname and suffix is the given name.
            string givenname = _text.Substring(givennamePos, suffixPos - givennamePos);
            prc.Add(ParseResult.Create(givenname, givennamePos, POSType.A_NR));
            prc.Add(ParseResult.Create(suffix, suffixPos, POSType.D_N));
        }
        else
        {
            // The suffix follows the surname directly.
            prc.Add(ParseResult.Create(suffix, givennamePos, POSType.D_N));
        }

        return prc;
    }

    // 5. A verb after the name: the text between surname and verb is the given name.
    int verbPos;
    bool verbFound = MatchVerb(_text, givennamePos, out verbPos);
    if (verbFound && verbPos > givennamePos)
    {
        if (!surnameInserted)
        {
            prc.Add(ParseResult.Create(surname, surnamePos, POSType.A_NR));
        }

        string givenname = _text.Substring(givennamePos, verbPos - givennamePos);
        prc.Add(ParseResult.Create(givenname, givennamePos, POSType.A_NR));
        return prc;
    }

    // 6. Punctuation within the next four characters ends the given name.
    int punctuationPos = MatchPunctation(_text, givennamePos, 4);
    if (punctuationPos > 0)
    {
        if (!surnameInserted)
        {
            prc.Add(ParseResult.Create(surname, surnamePos, POSType.A_NR));
        }

        string givenname = _text.Substring(givennamePos, punctuationPos - givennamePos);
        prc.Add(ParseResult.Create(givenname, givennamePos, POSType.A_NR));
        return prc;
    }

    // 7. The text ends shortly after the surname: the remainder is the given name.
    int remaining = _text.Length - givennamePos;
    if (remaining > 0 && remaining <= MaximumGivennameLength)
    {
        if (!surnameInserted)
        {
            prc.Add(ParseResult.Create(surname, surnamePos, POSType.A_NR));
        }

        string givenname = _text.Substring(givennamePos, remaining);
        prc.Add(ParseResult.Create(givenname, givennamePos, POSType.A_NR));
    }

    return prc;
}
/// <summary>
/// Tries to recognise a Chinese postal address that starts at
/// <paramref name="startIndex"/> in the context text.
/// </summary>
/// <param name="startIndex">Index in <c>context.Text</c> at which to start matching.</param>
/// <returns>
/// A collection with a single <c>POSType.D_S</c> result carrying the recognised text and
/// the populated <c>ChineseAddress</c>, or an empty collection when no address component
/// was recognised.
/// </returns>
/// <remarks>
/// At most <c>maxChineseAddressLength</c> characters are examined. Characters are buffered
/// until a marker character (市, 区, 省, 路, 号, ...) closes the current component; scanning
/// stops at the first punctuation mark or space.
/// </remarks>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();

    string temp = _text.Substring(startIndex, Math.Min(maxChineseAddressLength, _text.Length - startIndex));
    char[] chars = temp.ToCharArray();

    StringBuilder sb = new StringBuilder();      // text of the component currently being read
    StringBuilder whole = new StringBuilder();   // all recognised components, concatenated
    ChineseAddress ca = new ChineseAddress();

    // TODO: look the country name up in a dictionary.
    int startpos = 0;
    // Ordinal comparison: startpos below relies on exactly two UTF-16 units having matched.
    if (temp.StartsWith("中国", StringComparison.Ordinal))
    {
        startpos = 2;
        ca.country = "中国";
        whole.Append("中国");
    }

    for (int i = startpos; i < chars.Length; i++)
    {
        char ch = chars[i];

        if (ch == '市' || ch == '场')
        {
            // A marker with nothing in front of it is ordinary text (same rule in the branches below).
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            // GetMaximumMatch is presumably a longest-match dictionary lookup; null means "no match".
            string city = GetMaximumMatch(sb.ToString(), 0, 5);
            if (city != null)
            {
                ca.city = city;
                whole.Append(ca.city);
                sb = new StringBuilder();
            }
        }
        else if (ch == '区')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            string subStr = sb.ToString();
            string district = GetMaximumMatch(subStr, 0, 5);
            if (district == null)
            {
                // Unknown name: accept the whole buffer as the district.
                ca.district = subStr;
                whole.Append(ca.district);
            }
            else if (!district.EndsWith("区", StringComparison.Ordinal))
            {
                // The match is a city written in front of the district; split the buffer.
                ca.city = district;
                whole.Append(ca.city);
                ca.district = subStr.Substring(ca.city.Length);
                whole.Append(ca.district);
            }
            else
            {
                ca.district = district;
                whole.Append(ca.district);
            }

            sb = new StringBuilder();
        }
        else if (ch == '省')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            string province = GetMaximumMatch(sb.ToString(), 0, 5);
            if (province != null)
            {
                ca.province = province;
                whole.Append(ca.province);
                sb = new StringBuilder();
            }
        }
        else if (ch == '乡' || ch == '村' || ch == '县' || ch == '镇')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            ca.county = sb.ToString();
            whole.Append(ca.county);
            sb = new StringBuilder();
        }
        else if (ch == '巷')
        {
            // NOTE(review): '巷' is consumed without being buffered or recorded, so it is
            // missing from the resulting text - confirm whether it should be handled
            // like the street markers below.
        }
        else if (ch == '楼' || ch == '弄' || ch == '号' || ch == '室')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            // The marker only closes a component when the buffered text is a number
            // (after converting Chinese numerals); otherwise it stays in the buffer.
            string substr = NumeralUtil.ConvertChineseNumeral2Arabic(sb.ToString());
            sb.Append(ch);

            int x;
            if (Int32.TryParse(substr, out x))
            {
                string component = sb.ToString();
                if (ch == '楼')
                {
                    ca.floor = component;
                }
                else if (ch == '弄')
                {
                    ca.lane = component;
                }
                else if (ch == '号')
                {
                    ca.no = component;
                }
                else
                {
                    ca.room = component;
                }

                whole.Append(component);
                sb = new StringBuilder();
            }
        }
        else if (ch == '道' || ch == '路' || ch == '街')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            ca.street = sb.ToString();
            whole.Append(ca.street);
            sb = new StringBuilder();
        }
        else if (ch == '(' || ch == '\uFF08')
        {
            // ASCII or full-width opening parenthesis: start a fresh parenthesised remark
            // (whatever was buffered before it is discarded).
            sb = new StringBuilder();
            sb.Append(ch);
        }
        else if (ch == ')' || ch == '\uFF09')
        {
            // ASCII or full-width closing parenthesis: the remark is complete.
            sb.Append(ch);
            string remark = sb.ToString();
            whole.Append(remark);
            ca.extra = remark;
            sb = new StringBuilder();
        }
        else if (CharacterUtil.IsChinesePunctuation(ch) || ch == ' ' || ch == '\u3000')
        {
            // Punctuation, an ASCII space or an ideographic space ends the address.
            break;
        }
        else if (ch == '大')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            // "大" closes a component only as part of "大桥", "大厦" or "大酒店".
            bool compound = false;
            if (i + 1 < chars.Length)
            {
                char nextchar = chars[i + 1];
                if (nextchar == '桥' || nextchar == '厦')
                {
                    string name = sb.ToString() + "大" + nextchar;
                    whole.Append(name);
                    if (nextchar == '桥')
                    {
                        ca.extra += name;
                    }
                    else
                    {
                        ca.building = name;
                    }

                    i += 1; // also consume the character after "大"
                    sb = new StringBuilder();
                    compound = true;
                }
                else if (nextchar == '酒' && i + 2 < chars.Length && chars[i + 2] == '店')
                {
                    string name = sb.ToString() + "大酒店";
                    // A leading city (or province) name is split off the hotel name.
                    string city = GetMaximumMatch(name, 0, 5);
                    if (city != null)
                    {
                        ca.city = city;
                        whole.Append(ca.city);
                        name = name.Substring(ca.city.Length);
                    }

                    whole.Append(name);
                    ca.building = name;
                    i += 2; // also consume "酒店"
                    sb = new StringBuilder();
                    compound = true;
                }
            }

            if (!compound)
            {
                // Ordinary "大" (as in "人民大道"): keep it in the buffer instead of dropping it.
                sb.Append(ch);
            }
        }
        else if (ch == '餐')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            if (i + 1 < chars.Length && chars[i + 1] == '厅')
            {
                string name = sb.ToString() + "餐厅";
                whole.Append(name);
                ca.extra += name;
                i += 1; // also consume "厅"
                sb = new StringBuilder();
            }
            else
            {
                // Ordinary "餐": keep it in the buffer instead of dropping it.
                sb.Append(ch);
            }
        }
        else
        {
            sb.Append(ch);
            string extra = sb.ToString();
            if (extra.EndsWith("中心", StringComparison.Ordinal) || extra.EndsWith("酒店", StringComparison.Ordinal))
            {
                // A building name; a leading city name is split off (and, unlike in the
                // "大酒店" case above, not appended to the recognised text).
                string city = GetMaximumMatch(extra, 0, 5);
                if (city != null)
                {
                    ca.city = city;
                    extra = extra.Substring(city.Length);
                }

                ca.building = extra;
                whole.Append(extra);

                // "中心大厦" / "酒店大厦": the trailing "大厦" belongs to the same building.
                if (i + 2 < chars.Length && chars[i + 1] == '大' && chars[i + 2] == '厦')
                {
                    ca.building += "大厦";
                    whole.Append("大厦");
                    i += 2;
                }

                sb = new StringBuilder();
            }
        }
    }

    if (whole.Length > 0)
    {
        // Unclassified trailing text is kept as the extra part (replacing any earlier remark).
        if (sb.Length > 0)
        {
            ca.extra = sb.ToString();
        }

        prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.D_S, ca));
    }

    return prc;
}