/// <summary>
/// Scans a run of numeral characters starting at <paramref name="startIndex"/> and, when the
/// character that ends the run is accepted by <c>IsChineseQuantity</c>, reports the numeral
/// (<c>POSType.A_M</c>) followed by that single character (<c>POSType.A_Q</c>).
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> where scanning starts.</param>
/// <returns>
/// Two results (numeral, quantity character) on success; otherwise an empty collection.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();

    // The scan below indexes _text[startIndex] unconditionally, so reject positions that
    // are not inside the text instead of throwing.
    if (string.IsNullOrEmpty(_text) || startIndex < 0 || startIndex >= _text.Length)
    {
        return prc;
    }

    int i = startIndex;
    char ch = _text[i];

    // A run may not begin with a positional unit character other than '十'.
    if (NumeralUtil.IsChineseGenDigit(ch) && ch != '十')
    {
        return prc;
    }

    // Advance while the current character is part of a numeral and a following character
    // exists; the "i + 1" test keeps _text[++i] in range.
    while ((NumeralUtil.IsArabicNumeral(ch) || NumeralUtil.IsChineseNumeralChars(ch) || ch == '.')
           && i + 1 < _text.Length)
    {
        ch = _text[++i];
    }

    if (i == startIndex)
    {
        // Nothing was consumed, so there is no numeral in front of a quantity character.
        return prc;
    }

    // i never exceeds _text.Length - 1 here, so it can be used as an index directly.
    if (IsChineseQuantity(_text[i]))
    {
        prc.Add(ParseResult.Create(_text.Substring(startIndex, i - startIndex), startIndex, POSType.A_M));
        prc.Add(ParseResult.Create(_text[i].ToString(), i, POSType.A_Q));
    }

    return prc;
}
/// <summary>
/// Collects the run of digits that starts at <paramref name="startIndex"/> and reports it as a
/// single <c>POSType.A_M</c> result when its length fits the active pattern
/// (presumably a postal code: 6 digits for China, 5 for NorthAmerica).
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> where scanning starts.</param>
/// <returns>
/// A collection with the digit run, or an empty collection when no digits were found or the
/// length does not match the pattern.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParserPattern _format = context.Pattern;
    ParseResultCollection prc = new ParseResultCollection();

    if (string.IsNullOrEmpty(_text) || startIndex < 0 || startIndex >= _text.Length)
    {
        return prc;
    }

    StringBuilder sb = new StringBuilder(6);
    int i = startIndex;

    // Test the bound BEFORE reading the character: the previous form read _text[++i] first,
    // which threw when the digit run reached the end of the text.
    while (i < _text.Length)
    {
        char ch = _text[i];

        // NOTE(review): the explicit '0'..'9' range looks redundant next to IsArabicNumeral;
        // presumably one of the two was meant to cover full-width digits - confirm.
        bool isDigit = NumeralUtil.IsArabicNumeral(ch) || (ch >= '0' && ch <= '9');
        if (!isDigit)
        {
            break;
        }

        sb.Append(ch);
        i++;
    }

    string source = sb.ToString();
    if (source.Length == 0)
    {
        // No digits at this position: never emit a zero-length result.
        return prc;
    }

    if (_format == ParserPattern.China)
    {
        if (source.Length != 6)
        {
            return prc;
        }
    }
    else if (_format == ParserPattern.NorthAmerica)
    {
        if (source.Length != 5)
        {
            return prc;
        }
    }

    prc.Add(ParseResult.Create(source, startIndex, POSType.A_M));
    return prc;
}
/// <summary>
/// Reads consecutive digits beginning at <paramref name="startIndex"/> and emits them as one
/// <c>POSType.A_M</c> result if the run length is valid for the current pattern
/// (6 for <c>ParserPattern.China</c>, 5 for <c>ParserPattern.NorthAmerica</c>; other patterns
/// accept any non-empty run).
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> where scanning starts.</param>
/// <returns>The recognised digit run, or an empty collection.</returns>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParserPattern _format = context.Pattern;
    ParseResultCollection prc = new ParseResultCollection();

    if (string.IsNullOrEmpty(_text) || startIndex < 0 || startIndex >= _text.Length)
    {
        return(prc);
    }

    StringBuilder sb = new StringBuilder(6);

    // Bounds are checked in the loop header so that a digit run reaching the end of the text
    // terminates normally instead of indexing one past the last character.
    for (int i = startIndex; i < _text.Length; i++)
    {
        char ch = _text[i];

        // NOTE(review): the second test duplicates what IsArabicNumeral appears to cover;
        // it may originally have targeted full-width digits - confirm.
        if (!(NumeralUtil.IsArabicNumeral(ch) || (ch >= '0' && ch <= '9')))
        {
            break;
        }

        sb.Append(ch);
    }

    string source = sb.ToString();

    // An empty run is never a match, whatever the pattern.
    if (source.Length == 0)
    {
        return(prc);
    }

    if (_format == ParserPattern.China && source.Length != 6)
    {
        return(prc);
    }

    if (_format == ParserPattern.NorthAmerica && source.Length != 5)
    {
        return(prc);
    }

    prc.Add(ParseResult.Create(source, startIndex, POSType.A_M));
    return(prc);
}
/// <summary>
/// Walks <paramref name="text"/> from left to right and, at each position, asks every parser
/// built from <c>parserConstructors</c> (in list order) to parse at that position. The first
/// parser that returns results wins; its results are appended and the position advances by the
/// summed <c>Length</c> of those results.
/// </summary>
/// <param name="text">Text to analyse.</param>
/// <param name="pattern">Pattern stored in the parser context handed to every parser.</param>
/// <returns>All results collected over the whole text, in order of appearance.</returns>
public ParseResultCollection Recognize(string text, ParserPattern pattern)
{
    ParserContext context = new ParserContext();
    context.Pattern = pattern;
    context.Text = text;

    ParseResultCollection result = new ParseResultCollection();
    char[] chars = text.ToCharArray();
    int i = 0;

    while (i < chars.Length)
    {
        if (CharacterUtil.IsChinesePunctuation(chars[i]))
        {
            i++;
            continue;
        }

        // Every parser is tried at the SAME position. Previously the position was bumped
        // inside this loop after each failing parser, so later parsers looked at later
        // characters and the index could run past the end of the text.
        int start = i;

        // Parsers are tried in list order, so earlier entries take priority
        // (original note: place names are scanned before person names to rule out wrong names).
        foreach (ConstructorInfo ci in parserConstructors)
        {
            IParser parser = ci.Invoke(new object[] { context }) as IParser;
            if (parser == null)
            {
                // The constructed object does not implement IParser; nothing to call.
                continue;
            }

            try
            {
                ParseResultCollection prc = parser.Parse(start);
                if (prc.Count > 0)
                {
                    foreach (ParseResult pr in prc)
                    {
                        result.Add(pr);
                        i += pr.Length;
                    }
                    break;
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failing parser must not abort recognition of the rest.
                Console.WriteLine(ex);
            }
        }

        // Always make progress: covers "no parser matched", "no parsers registered" and
        // "matched results had zero total length".
        if (i <= start)
        {
            i = start + 1;
        }
    }

    return(result);
}
/// <summary>
/// Scans <paramref name="text"/> position by position. At each non-punctuation position the
/// parsers created from <c>parserConstructors</c> are tried in order; the first one that yields
/// results contributes them to the output and the scan resumes after the consumed characters.
/// </summary>
/// <param name="text">Text to analyse.</param>
/// <param name="pattern">Pattern placed into the <c>ParserContext</c> given to each parser.</param>
/// <returns>The concatenated results of all successful parses.</returns>
public ParseResultCollection Recognize(string text, ParserPattern pattern)
{
    ParserContext context = new ParserContext();
    context.Pattern = pattern;
    context.Text = text;

    ParseResultCollection result = new ParseResultCollection();
    char[] chars = text.ToCharArray();

    int i = 0;
    while (i < chars.Length)
    {
        char c = chars[i];
        if (CharacterUtil.IsChinesePunctuation(c))
        {
            i++;
            continue;
        }

        // Position at which every parser is tried. The position used to be incremented inside
        // the parser loop for each parser that failed, which made the parsers examine
        // different characters and could move the index beyond the text.
        int position = i;
        int consumed = 0;

        // List order is priority order (original note: place names come before person names
        // so that incorrect person names are excluded).
        foreach (ConstructorInfo ci in parserConstructors)
        {
            IParser parser = ci.Invoke(new object[] { context }) as IParser;
            if (parser == null)
            {
                continue;
            }

            try
            {
                ParseResultCollection prc = parser.Parse(position);
                if (prc.Count > 0)
                {
                    foreach (ParseResult pr in prc)
                    {
                        result.Add(pr);
                        consumed += pr.Length;
                    }
                    break;
                }
            }
            catch (Exception ex)
            {
                // A parser failure is logged and the next parser gets its chance.
                Console.WriteLine(ex);
            }
        }

        // Step by at least one character so the loop terminates even when nothing matched,
        // no parser is registered, or the matched results report zero length.
        i = position + (consumed > 0 ? consumed : 1);
    }

    return result;
}
/// <summary>
/// Asks <c>MatchPlaceName</c> for a place name at <paramref name="startIndex"/> and wraps a hit
/// in a single <c>POSType.A_NS</c> result.
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> to match at.</param>
/// <returns>One result when a place name was matched; otherwise an empty collection.</returns>
public ParseResultCollection Parse(int startIndex)
{
    string source = context.Text;
    ParseResultCollection results = new ParseResultCollection();

    string matched = MatchPlaceName(source, startIndex);
    if (matched == null)
    {
        return results;
    }

    results.Add(new ParseResult
    {
        Text = matched,
        StartPos = startIndex,
        Type = POSType.A_NS
    });
    return results;
}
/// <summary>
/// Looks for a place name beginning at <paramref name="startIndex"/> (via
/// <c>MatchPlaceName</c>) and returns it tagged as <c>POSType.A_NS</c>.
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> to match at.</param>
/// <returns>A collection with zero or one result.</returns>
public ParseResultCollection Parse(int startIndex)
{
    ParseResultCollection found = new ParseResultCollection();
    string name = MatchPlaceName(context.Text, startIndex);

    if (name != null)
    {
        ParseResult hit = new ParseResult();
        hit.Text = name;
        hit.StartPos = startIndex;
        hit.Type = POSType.A_NS;
        found.Add(hit);
    }

    return(found);
}
/// <summary>
/// Runs <c>addressRegex</c> over the upper-cased remainder of <c>context.Text</c> starting at
/// <paramref name="startIndex"/> and, on a match, reports the normalised <c>Address</c> as a
/// <c>POSType.A_M</c> result.
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> where the input begins.</param>
/// <returns>One result carrying the <c>Address</c>, or an empty collection when nothing matched.</returns>
/// <exception cref="InvalidOperationException">
/// The context pattern is not <c>ParserPattern.NorthAmerica</c>.
/// </exception>
public ParseResultCollection Parse(int startIndex)
{
    ParseResultCollection results = new ParseResultCollection();
    string remainder = context.Text.Substring(startIndex);

    if (context.Pattern != ParserPattern.NorthAmerica)
    {
        throw new InvalidOperationException("To use USAddressParser, Parser pattern must be NorthAmerica");
    }

    if (string.IsNullOrEmpty(remainder))
    {
        return(results);
    }

    var match = addressRegex.Match(remainder.ToUpperInvariant());
    if (!match.Success)
    {
        return(results);
    }

    // Extract the matched fields, normalise them and build the address value.
    var address = new Address(Normalize(GetApplicableFields(match)));
    results.Add(ParseResult.Create(address.ToString(), startIndex, POSType.A_M, address));
    return(results);
}
/// <summary>
/// Recognises "numeral + quantity character" at <paramref name="startIndex"/>: consumes numeral
/// characters (Arabic, Chinese, or '.') and, if the character that stops the run satisfies
/// <c>IsChineseQuantity</c>, emits the numeral as <c>POSType.A_M</c> and that character as
/// <c>POSType.A_Q</c>.
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> where scanning starts.</param>
/// <returns>Two results on success; an empty collection otherwise.</returns>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();

    // _text[startIndex] is read below; an empty text or an index outside the text used to
    // raise IndexOutOfRangeException.
    if (string.IsNullOrEmpty(_text) || startIndex < 0 || startIndex >= _text.Length)
    {
        return prc;
    }

    int i = startIndex;
    char ch = _text[i];

    while ((NumeralUtil.IsArabicNumeral(ch) || NumeralUtil.IsChineseNumeralChars(ch) || ch == '.')
           && i + 1 < _text.Length)
    {
        // The first character may not be a positional unit (other than '十').
        if (i == startIndex && NumeralUtil.IsChineseGenDigit(ch) && ch != '十')
        {
            return prc;
        }

        ch = _text[++i];
    }

    if (i == startIndex)
    {
        return prc;
    }

    // The loop guarantees i <= _text.Length - 1, so no clamping is needed (the former
    // Math.Min(i, _text.Length) could only ever have produced an invalid index).
    if (IsChineseQuantity(ch))
    {
        prc.Add(ParseResult.Create(_text.Substring(startIndex, i - startIndex), startIndex, POSType.A_M));
        prc.Add(ParseResult.Create(ch.ToString(), i, POSType.A_Q));
    }

    return prc;
}
/// <summary>
/// Attempts to read a US address from <c>context.Text</c> beginning at
/// <paramref name="startIndex"/>. The remaining text is upper-cased, matched against
/// <c>addressRegex</c>, and a successful match is normalised into an <c>Address</c>.
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> where the input begins.</param>
/// <returns>
/// A collection with one <c>POSType.A_M</c> result whose text is <c>Address.ToString()</c>,
/// or an empty collection.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The context pattern is not <c>ParserPattern.NorthAmerica</c>.
/// </exception>
public ParseResultCollection Parse(int startIndex)
{
    ParseResultCollection output = new ParseResultCollection();
    string tail = context.Text.Substring(startIndex);

    bool isNorthAmerica = context.Pattern == ParserPattern.NorthAmerica;
    if (!isNorthAmerica)
    {
        throw new InvalidOperationException("To use USAddressParser, Parser pattern must be NorthAmerica");
    }

    bool hasInput = !string.IsNullOrEmpty(tail);
    if (hasInput)
    {
        string upper = tail.ToUpperInvariant();
        var m = addressRegex.Match(upper);
        if (m.Success)
        {
            var fields = GetApplicableFields(m);
            var normalized = Normalize(fields);
            var parsed = new Address(normalized);
            output.Add(ParseResult.Create(parsed.ToString(), startIndex, POSType.A_M, parsed));
        }
    }

    return output;
}
// Tries to recognise a person name in context.Text starting at startIndex and returns the
// pieces found as separate results: prefix and suffix as POSType.D_N, surname and given name
// as POSType.A_NR. Returns an empty collection when MatchSurname finds no surname.
//
// Flow, in the order the code runs:
//   - MatchPrefix at startIndex; MatchSurname right after the prefix (or at startIndex).
//   - When a prefix was found, prefix and surname are emitted immediately and currentPos
//     moves past the surname.
//   - If the character at currentPos + 1 is Chinese punctuation, whatever has been collected
//     so far is returned.
//   - MatchSuffix: on a hit the surname (if not yet emitted), the text between surname and
//     suffix (as given name, when non-empty) and the suffix are emitted, then the method returns.
//   - MatchVerb: when it reports a position beyond the surname, the text up to that position
//     becomes the given name.
//   - MatchPunctation(..., 4): when it returns a positive position, the text up to it becomes
//     the given name.
//   - Finally, when between 1 and MaximumGivennameLength characters remain after the surname,
//     the rest of the text becomes the given name.
//
// NOTE(review): the exact meaning of the positions returned through the out parameters of
// MatchSuffix / MatchVerb and by MatchPunctation is assumed from how they are used here -
// confirm against those helpers.
// NOTE(review): the source text of this method is collapsed onto a few physical lines, so the
// inline "//" comments below swallow the code that follows them on the same line; the layout
// is left untouched here.
public ParseResultCollection Parse(int startIndex) { string _text = context.Text; ParseResultCollection prc = new ParseResultCollection(); //TODO: handle Chinese renderings of foreign names (no surname) //3 find the prefix string prefix = MatchPrefix(_text, startIndex); int prefixlength = 0; if (prefix != null) { prefixlength = prefix.Length; } //1 scan for a surname from the Hundred Family Surnames list //look up single-character surnames int currentPos = startIndex+prefixlength; string surname = MatchSurname(_text, currentPos); if (surname == null) { return prc; } bool surnameInserted = false; bool givennameInserted = false; if (prefix != null && surname != null) { prc.Add(ParseResult.Create(prefix, startIndex, POSType.D_N)); //prefix surnameInserted = true; prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); currentPos += surname.Length; } //2 if the surname is followed by punctuation, treat it as not a person name if (currentPos + 1 < _text.Length && CharacterUtil.IsChinesePunctuation(_text[currentPos + 1])) { return prc; } //1.1 search the library for a complete name by maximum matching; if it matches with a high weight, treat it as a person name directly //string fullname = MatchFullname(_text, startIndex); //if (fullname != null) //{ // prc.Add(ParseResult.Create(surname, startIndex, POSType.A_NR)); // prc.Add(ParseResult.Create(fullname.Substring(surname.Length), startIndex + surname.Length, POSType.A_NR)); // return prc; //} //3 find the given name //TODO: narrow the range of given names, otherwise mismatches are likely //string givenname = MatchGivenname(_text, startIndex + surname.Length); //if (givenname != null) //{ // string suffix2 = MatchSuffix(_text, startIndex + surname.Length + givenname.Length, _siblingWordDB); // if (suffix != null && givenname.Length <= suffix.Length) // { // givenname = null; // } // else // { // suffix = suffix2; // } //} //4 if a form of address follows (e.g. Mr., Miss, Dr., doctor), treat it as a person name int resultStartPos = -1; if (surname != null) { resultStartPos = currentPos + (surnameInserted?0:surname.Length); string suffix = MatchSuffix(_text, resultStartPos, out resultStartPos); if (suffix != null) { if (!surnameInserted) { prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); surnameInserted = true; currentPos += surname.Length; } if (resultStartPos > currentPos) { string givenname = 
_text.Substring(currentPos, resultStartPos - currentPos); prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR)); prc.Add(ParseResult.Create(suffix, resultStartPos, POSType.D_N)); currentPos += givenname.Length + suffix.Length; givennameInserted = true; } else { prc.Add(ParseResult.Create(suffix, currentPos, POSType.D_N)); currentPos += suffix.Length; } return prc; } } // 5 if a verb or causative verb is found next to it, it can be treated as a person name if (surname != null) { resultStartPos = currentPos + (surnameInserted ? 0 : surname.Length); bool verbFound = MatchVerb(_text, resultStartPos, out resultStartPos); if (verbFound && resultStartPos > currentPos + (surnameInserted ? 0 : surname.Length)) { if (!surnameInserted) { prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); surnameInserted = true; currentPos += surname.Length; } if (!givennameInserted) { string givenname = _text.Substring(currentPos, resultStartPos - currentPos); prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR)); currentPos += givenname.Length; givennameInserted = true; } } } if (surname != null) { //punctuation directly after the name: treat it as a person name int punctuationPos = MatchPunctation(_text, currentPos + (surnameInserted ? 
0 : surname.Length), 4); if (punctuationPos > 0) { if (!surnameInserted) { prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); surnameInserted = true; currentPos += surname.Length; } if (!givennameInserted) { string givenname = _text.Substring(currentPos, punctuationPos - currentPos); prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR)); currentPos += givenname.Length; givennameInserted = true; } } } if (surname != null && _text.Length - currentPos - surname.Length <= MaximumGivennameLength && _text.Length - currentPos - surname.Length>0) //case where no characters follow the name { if (!surnameInserted) { prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); surnameInserted = true; currentPos += surname.Length; } if (!givennameInserted) { string givenname = _text.Substring(currentPos, _text.Length - currentPos); prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR)); currentPos += givenname.Length; givennameInserted = true; } } return prc; }
/// <summary>
/// Recognises a phone number in <c>context.Text</c> starting at <paramref name="startIndex"/>.
/// The method first collects the longest run of characters accepted by <c>IsAllowedChar</c>,
/// then splits that run into country code, area code, main number and extension according to
/// the active pattern (China or NorthAmerica) and stores them in a <c>PhoneNo</c>.
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> where scanning starts.</param>
/// <returns>
/// One <c>POSType.A_M</c> result carrying the <c>PhoneNo</c> when a main number was assigned;
/// otherwise an empty collection.
/// </returns>
/// <exception cref="NotImplementedException">
/// The pattern is neither China nor NorthAmerica and the candidate run passed the length filter.
/// </exception>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParserPattern _pattern = context.Pattern;
    ParseResultCollection prc = new ParseResultCollection();

    // _text[startIndex] is read right below; reject positions outside the text.
    if (string.IsNullOrEmpty(_text) || startIndex < 0 || startIndex >= _text.Length)
    {
        return prc;
    }

    // NOTE(review): both operands look identical here; presumably one of them was a
    // different (e.g. full-width) space character - confirm against the original encoding.
    if (_text[startIndex] == ' ' || _text[startIndex] == ' ')
    {
        return prc;
    }

    // ---- Phase 1: collect the run of allowed characters, normalising as we go ----
    StringBuilder sb = new StringBuilder(10);
    char ch;
    int k = startIndex;

    // Index (relative to the collected run) of a '(' that has not been closed yet, or -1.
    int braceStartPos = -1;

    while (k < _text.Length)
    {
        ch = _text[k];
        if (!IsAllowedChar(ch, _pattern))
        {
            break;
        }

        // NOTE(review): as written, most of these normalisations map a character onto itself;
        // they presumably converted full-width digits/brackets/spaces to their ASCII forms -
        // confirm the literals.
        if (ch >= '0' && ch <= '9')
        {
            ch = (char)(ch - '0' + '0');
        }

        if (ch == ' ')
        {
            ch = ' ';
        }
        else if (ch == '(')
        {
            ch = '(';
        }
        else if (ch == ')')
        {
            ch = ')';
        }
        else if (ch == '-' || ch == '—')
        {
            ch = '-';
        }

        if (ch == '(')
        {
            // Store the offset inside the collected run, not the absolute text index:
            // it is used below to cut allowedString, which starts at startIndex.
            braceStartPos = k - startIndex;
        }
        else if (ch == ')')
        {
            braceStartPos = -1;
        }

        sb.Append(ch);
        k++;
    }

    string allowedString = sb.ToString().TrimEnd();

    // Drop a trailing, unclosed "(..." group.
    if (braceStartPos >= 0)
    {
        allowedString = allowedString.Substring(0, braceStartPos);
    }

    if (allowedString.Length < 3 || allowedString.Length == 4)
    {
        return prc;
    }

    // ---- Phase 2: split the run into the parts of a phone number ----
    bool bNumberInBrace = false;
    bool bCountryCodeStarted = false;
    bool bAreaCodeStarted = false;
    bool bExtStarted = false;
    int i = 0;
    StringBuilder segment = new StringBuilder();   // digits of the part currently being read
    StringBuilder whole = new StringBuilder();     // text reported as the result
    PhoneNo phone = new PhoneNo();

    if (_pattern == ParserPattern.China)
    {
        while (i < allowedString.Length)
        {
            ch = allowedString[i];
            if (ch == '(')
            {
                bNumberInBrace = true;
                bCountryCodeStarted = false;
                whole.Append(ch);
            }
            else if (NumeralUtil.IsArabicNumeral(ch))
            {
                // The first digit group is taken as an area code unless one is already known
                // or a country code is being read.
                if (segment.Length == 0 && !bAreaCodeStarted && phone.AreaCode == null && !bCountryCodeStarted)
                {
                    bAreaCodeStarted = true;
                }
                segment.Append(ch);
                whole.Append(ch);
            }
            else if (ch == ')' && bNumberInBrace)
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }
                    bCountryCodeStarted = false;
                }
                if (bAreaCodeStarted)
                {
                    // Area codes starting with 0 have at most 4 digits; others at most 3.
                    if (segment.Length > 0 && (segment[0] == '0' ? segment.Length <= 4 : segment.Length <= 3))
                    {
                        phone.AreaCode = segment.ToString();
                    }
                    bAreaCodeStarted = false;
                }
                whole.Append(ch);
                segment = new StringBuilder();
                bNumberInBrace = false;
            }
            else if (ch == ' ')
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }
                    bCountryCodeStarted = false;
                }
                else if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }
                    bAreaCodeStarted = false;
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                }
                segment = new StringBuilder();
                bCountryCodeStarted = false;
                whole.Append(ch);
            }
            else if (ch == '-' || ch == '#')
            {
                // Check the length first: the separator can follow ')' or start the run,
                // in which case the segment is empty and segment[0] would throw.
                if (segment.Length > 0 && segment[0] == '0' && (segment.Length == 3 || segment.Length == 4))
                {
                    phone.AreaCode = segment.ToString();
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                    bExtStarted = true;
                }
                segment = new StringBuilder();
                whole.Append(ch);
            }
            else if (ch == '+')
            {
                whole.Append(ch);
                bCountryCodeStarted = true;
            }
            i++;
        }

        if (segment.Length > 0)
        {
            AssignPhoneMain(segment, phone);
            if (bExtStarted)
            {
                phone.Extension = segment.ToString();
                bExtStarted = false;
            }
        }
    }
    else if (_pattern == ParserPattern.NorthAmerica)
    {
        while (i < allowedString.Length)
        {
            ch = allowedString[i];
            if (NumeralUtil.IsArabicNumeral(ch))
            {
                whole.Append(ch);
                segment.Append(ch);
            }
            else if (ch == ' ')
            {
                whole.Append(ch);
            }
            else if (ch == '(')
            {
                bAreaCodeStarted = true;
                whole.Append(ch);
            }
            else if (ch == ')')
            {
                if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }
                    bAreaCodeStarted = false;
                }
                segment = new StringBuilder();
                whole.Append(ch);
            }
            else if (ch == '-')
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }
                    bCountryCodeStarted = false;
                    bAreaCodeStarted = true;
                }
                else if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }
                    bAreaCodeStarted = false;
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                }
                whole.Append(ch);
                segment = new StringBuilder();
            }
            else if (ch == '+')
            {
                bCountryCodeStarted = true;
                whole.Append(ch);
            }
            else if (ch == '.')
            {
                // NOTE(review): bExtStarted is never set in this branch and the segment is not
                // reset after "ext.", so the extension is not captured for this pattern as the
                // code stands - left unchanged, confirm intended behaviour.
                if (segment.ToString() != "ext")
                {
                    break;
                }
                whole.Append("ext.");
            }
            else if (ch == 'e' || ch == 'x' || ch == 't')
            {
                segment.Append(ch);
            }
            i++;
        }

        if (segment.Length > 0)
        {
            AssignPhoneMain(segment, phone);
            if (bExtStarted)
            {
                phone.Extension = segment.ToString();
                bExtStarted = false;
            }
        }
    }
    else
    {
        throw new NotImplementedException("Phone No. in "+_pattern.ToString()+" is not implemented in the parser.");
    }

    if (whole.Length > 0 && phone.Main != null)
    {
        prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.A_M, phone));
    }
    return prc;
}
/// <summary>
/// Recognises a Chinese date/time expression starting at <paramref name="startIndex"/>.
/// The text is first passed through <c>NumeralUtil.ConvertChineseNumeral2Arabic</c>; then up to
/// <c>maxDateTimeTextLength</c> characters are scanned. Two kinds of expression are produced:
/// relative ones (prefix character + '天' / '年' / '周'), returned as text only, and numeric
/// ones (digits followed by 年/月/日/点/分/秒), for which a pattern string is built and handed to
/// <c>DateUtil.ParseDate</c>.
/// </summary>
/// <param name="startIndex">Zero-based position in the converted text where scanning starts.</param>
/// <returns>
/// One <c>POSType.D_T</c> result (with the parsed date attached for numeric expressions), or an
/// empty collection.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    string text = NumeralUtil.ConvertChineseNumeral2Arabic(context.Text);
    ParseResultCollection prc = new ParseResultCollection();

    // Substring below throws for positions outside the text; treat them as "no match".
    if (text == null || startIndex < 0 || startIndex >= text.Length)
    {
        return(prc);
    }

    int boundary = Math.Min(maxDateTimeTextLength, text.Length - startIndex);
    string temp = text.Substring(startIndex, boundary);

    StringBuilder sbDateText = new StringBuilder();     // digits + unit chars, fed to ParseDate
    StringBuilder sbPatternText = new StringBuilder();  // matching pattern, fed to ParseDate
    StringBuilder sbText = new StringBuilder();         // text reported in the result
    int strLen = 0;                                     // digits seen since the last unit char
    char prevCh = ' ';
    bool nonNumeric = false;                            // set when a relative expression matched

    for (int i = 0; i < boundary; i++)
    {
        char ch = temp[i];

        if (NumeralUtil.IsArabicNumeral(ch))
        {
            sbDateText.Append(ch);
            sbText.Append(ch);
            strLen++;
        }
        else if (ch == '大' || ch == '前' || ch == '昨' || ch == '明' || ch == '今' || ch == '后' || ch == '去')
        {
            // Possible prefix of a relative expression; only remembered through prevCh.
        }
        else if (ch == '上')
        {
            // '上' is accepted only as the prefix of "上周". Without this branch the character
            // ended the scan, which made the '周' branch below unreachable.
            if (i + 1 >= boundary || temp[i + 1] != '周')
            {
                break;
            }
        }
        else if (ch == '周')
        {
            if (prevCh == '上')
            {
                nonNumeric = true;
                sbText.Append(prevCh);
                sbText.Append(ch);
                break;
            }
        }
        else if (ch == '天')
        {
            if (prevCh == '前' || prevCh == '昨' || prevCh == '明' || prevCh == '今' || prevCh == '后')
            {
                nonNumeric = true;
                sbText.Append(prevCh);
                sbText.Append(ch);
                break;
            }
        }
        else if (ch == ' ')
        {
            // Spaces are kept in the reported text but do not update prevCh.
            sbText.Append(ch);
            continue;
        }
        else
        {
            // Map the unit character to the letter used in the date pattern.
            char patternLetter;
            switch (ch)
            {
                case '年': patternLetter = 'y'; break;
                case '月': patternLetter = 'M'; break;
                case '日': patternLetter = 'd'; break;
                case '点': patternLetter = 'h'; break;
                case '分': patternLetter = 'm'; break;
                case '秒': patternLetter = 's'; break;
                default: patternLetter = '\0'; break;
            }

            if (patternLetter == '\0')
            {
                // Any other character ends the expression.
                break;
            }

            // Relative year expressions: prefix character directly followed by '年'.
            if (ch == '年' && (prevCh == '去' || prevCh == '前' || prevCh == '今' || prevCh == '后'))
            {
                nonNumeric = true;
                sbText.Append(prevCh);
                sbText.Append(ch);
                break;
            }

            // A unit character must be preceded by at least one digit.
            if (strLen == 0)
            {
                return(prc);
            }

            sbDateText.Append(ch);
            sbPatternText.Append(DateUtil.GeneratePatternText(patternLetter, strLen));
            sbPatternText.Append(ch);
            sbText.Append(ch);
            strLen = 0;
        }

        prevCh = ch;
    }

    if (sbText.Length > 0 && nonNumeric)
    {
        prc.Add(ParseResult.Create(sbText.ToString(), startIndex, POSType.D_T));
        return(prc);
    }

    if (sbDateText.Length == 0 || sbPatternText.Length == 0)
    {
        return(prc);
    }

    DateTime? dt = DateUtil.ParseDate(sbDateText.ToString(), sbPatternText.ToString());
    if (dt != null)
    {
        prc.Add(ParseResult.Create(sbText.ToString(), startIndex, POSType.D_T, dt));
    }
    return(prc);
}
//public static ParseResultCollection Parse(string text)
//{
//    return ParseResultCollection.InternalParse(text, new OrgNameParser(text));
//}

/// <summary>
/// Tries to recognise an organisation name starting at <paramref name="startIndex"/>.
/// A window of at most <c>maxChineseOrgNameLength</c> characters is searched for the first
/// entry of <c>suffixList</c> that occurs after position 0. The text up to and including that
/// suffix is reported as <c>POSType.A_NT</c> when a place name is found either at the start of
/// the window (close enough to the suffix) or right after a "(" in the full text; otherwise
/// <c>MatchOrgName</c> is consulted.
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> where the window starts.</param>
/// <returns>A collection with zero or one result.</returns>
public ParseResultCollection Parse(int startIndex)
{
    string fullText = context.Text;
    ParseResultCollection results = new ParseResultCollection();

    int windowLength = Math.Min(maxChineseOrgNameLength, fullText.Length - startIndex);
    string window = fullText.Substring(startIndex, windowLength);

    // Locate the first suffix (in list order) that appears somewhere after the first character.
    string matchedSuffix = null;
    int suffixPos = -1;
    for (int n = 0; n < suffixList.Length && matchedSuffix == null; n++)
    {
        suffixPos = window.IndexOf(suffixList[n]);
        if (suffixPos > 0)
        {
            matchedSuffix = suffixList[n];
        }
    }

    // No suffix found: nothing to report.
    if (suffixPos <= 0)
    {
        return results;
    }

    // Look for a place name at the very beginning of the window.
    ParserContext windowContext = this.context.Clone();
    windowContext.Text = window;
    IParser leadingPlaceParser = new PlaceNameParser(windowContext);
    ParseResultCollection leadingPlace = leadingPlaceParser.Parse(0);

    string placeName = null;
    if (leadingPlace.Count > 0)
    {
        placeName = (string)leadingPlace[0].Text;
    }

    bool placeCloseToSuffix = placeName != null && suffixPos - placeName.Length < maxMiddlePartLength;
    if (placeCloseToSuffix)
    {
        string name = window.Substring(0, suffixPos + matchedSuffix.Length);
        results.Add(ParseResult.Create(name, startIndex, POSType.A_NT));
        return results;
    }

    int bracePos = context.Text.IndexOf("(");
    if (bracePos > 0)
    {
        // A place name directly after the opening bracket also qualifies the candidate.
        IParser bracketPlaceParser = new PlaceNameParser(context);
        ParseResultCollection bracketPlace = bracketPlaceParser.Parse(bracePos + 1);
        if (bracketPlace.Count > 0)
        {
            placeName = (string)bracketPlace[0].Text;
            string name = window.Substring(0, suffixPos + matchedSuffix.Length);
            results.Add(ParseResult.Create(name, startIndex, POSType.A_NT));
        }
        return results;
    }

    // No place name found: fall back to the organisation-name lookup.
    string orgName = MatchOrgName(window, 0);
    if (orgName != null)
    {
        results.Add(ParseResult.Create(orgName, startIndex, POSType.A_NT));
    }
    // else: not in the library; locating the boundary through predicates is not implemented.

    return results;

    /*
     * From "Modern Chinese Lexicon Research - Chinese Information Processing"
     *
     * Confirmation rules
     *  a. If the word before the candidate place-name string is a place-name boundary word and
     *     the word after it is a place-name feature word, both boundaries of the candidate are
     *     confirmed.
     *  b. If the word before the candidate is a place-name boundary word, the left boundary is
     *     confirmed.
     *  c. If the word after the candidate is a place-name boundary word, the right boundary is
     *     confirmed.
     *  d. If two candidates are in a coordinate relation and one of them is confirmed, the other
     *     is confirmed as well.
     *
     * Rejection rules
     *  - Title word: if the word before the candidate is a personal title and the candidate
     *    contains no place-name feature word, reject the candidate.
     *  - Boundary word: if the word after the candidate is a person-name boundary word and the
     *    candidate contains no place-name feature word, reject the candidate.
     *  - Coordination: if two candidates are in a coordinate relation and one is rejected, the
     *    other is rejected as well.
     *  - Other object class: if the word after the candidate is a feature word of another
     *    object class, reject the candidate (e.g. 红塔山香烟).
     *  - Non-single-character word: if the word before or the word after the candidate is not a
     *    single-character word, reject the candidate.
     *
     * Boundary correction rules
     *  - Title word with feature word: if the word before the candidate is a personal title and
     *    the candidate contains a place-name feature word, correct the place-name boundary.
     */
}
/// <summary>
/// Recognises a Chinese pronoun at <paramref name="startIndex"/> by looking at the character
/// there and, for some of them, at the character that follows. A match is reported as one
/// <c>POSType.D_R</c> result.
/// </summary>
/// <param name="startIndex">Zero-based position in <c>context.Text</c> to inspect.</param>
/// <returns>A collection with zero or one result.</returns>
public ParseResultCollection Parse(int startIndex)
{
    char[] chars = context.Text.ToArray();
    ParseResultCollection results = new ParseResultCollection();

    char head = chars[startIndex];
    bool hasNext = startIndex + 1 < chars.Length;
    char tail = hasNext ? chars[startIndex + 1] : '\0';

    string pronoun = string.Empty;
    switch (head)
    {
        // Demonstratives, optionally extended by one character.
        case '这':
        case '那':
            pronoun = head.ToString();
            if (hasNext && (tail == '些' || tail == '里' || tail == '儿'))
            {
                pronoun += tail;
            }
            break;

        // Personal pronouns, optionally followed by the plural marker.
        case '你':
        case '我':
        case '他':
        case '它':
        case '她':
        case '咱':
            pronoun = head.ToString();
            if (hasNext && tail == '们')
            {
                pronoun += tail;
            }
            break;

        // Single-character pronouns.
        case '谁':
        case '朕':
        case '此':
        case '彼':
            pronoun = head.ToString();
            break;

        // Two-character words that only count when complete.
        case '大':
            if (hasNext && tail == '家')
            {
                pronoun = "大家";
            }
            break;

        case '什':
            if (hasNext && tail == '么')
            {
                pronoun = "什么";
            }
            break;

        case '自':
            if (hasNext && tail == '己')
            {
                pronoun = "自己";
            }
            break;

        case '哪':
            pronoun = head.ToString();
            if (hasNext && tail == '里')
            {
                pronoun += tail;
            }
            break;
    }

    if (pronoun.Length > 0)
    {
        results.Add(new ParseResult
        {
            StartPos = startIndex,
            Text = pronoun,
            Type = POSType.D_R
        });
    }

    return(results);
}
// Tries to recognise a person name in context.Text starting at startIndex. The pieces found
// are returned as separate results: prefix and suffix as POSType.D_N, surname and given name
// as POSType.A_NR. An empty collection is returned when MatchSurname finds no surname.
//
// Flow, in the order the code runs:
//   - MatchPrefix at startIndex; MatchSurname right after the prefix (or at startIndex).
//   - When a prefix was found, prefix and surname are emitted immediately and currentPos
//     moves past the surname.
//   - If the character at currentPos + 1 is Chinese punctuation, whatever has been collected
//     so far is returned.
//   - MatchSuffix: on a hit the surname (if not yet emitted), the text between surname and
//     suffix (as given name, when non-empty) and the suffix are emitted, then the method returns.
//   - MatchVerb: when it reports a position beyond the surname, the text up to that position
//     becomes the given name.
//   - MatchPunctation(..., 4): when it returns a positive position, the text up to it becomes
//     the given name.
//   - Finally, when between 1 and MaximumGivennameLength characters remain after the surname,
//     the rest of the text becomes the given name.
//
// NOTE(review): the exact meaning of the positions returned through the out parameters of
// MatchSuffix / MatchVerb and by MatchPunctation is assumed from how they are used here -
// confirm against those helpers.
// NOTE(review): the source text of this method is collapsed onto a few physical lines, so the
// inline "//" comments below swallow the code that follows them on the same line; the layout
// is left untouched here.
public ParseResultCollection Parse(int startIndex) { string _text = context.Text; ParseResultCollection prc = new ParseResultCollection(); //TODO: handle Chinese renderings of foreign names (no surname) //3 find the prefix string prefix = MatchPrefix(_text, startIndex); int prefixlength = 0; if (prefix != null) { prefixlength = prefix.Length; } //1 scan for a surname from the Hundred Family Surnames list //look up single-character surnames int currentPos = startIndex + prefixlength; string surname = MatchSurname(_text, currentPos); if (surname == null) { return(prc); } bool surnameInserted = false; bool givennameInserted = false; if (prefix != null && surname != null) { prc.Add(ParseResult.Create(prefix, startIndex, POSType.D_N)); //prefix surnameInserted = true; prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); currentPos += surname.Length; } //2 if the surname is followed by punctuation, treat it as not a person name if (currentPos + 1 < _text.Length && CharacterUtil.IsChinesePunctuation(_text[currentPos + 1])) { return(prc); } //1.1 search the library for a complete name by maximum matching; if it matches with a high weight, treat it as a person name directly //string fullname = MatchFullname(_text, startIndex); //if (fullname != null) //{ // prc.Add(ParseResult.Create(surname, startIndex, POSType.A_NR)); // prc.Add(ParseResult.Create(fullname.Substring(surname.Length), startIndex + surname.Length, POSType.A_NR)); // return prc; //} //3 find the given name //TODO: narrow the range of given names, otherwise mismatches are likely //string givenname = MatchGivenname(_text, startIndex + surname.Length); //if (givenname != null) //{ // string suffix2 = MatchSuffix(_text, startIndex + surname.Length + givenname.Length, _siblingWordDB); // if (suffix != null && givenname.Length <= suffix.Length) // { // givenname = null; // } // else // { // suffix = suffix2; // } //} //4 if a form of address follows (e.g. Mr., Miss, Dr., doctor), treat it as a person name int resultStartPos = -1; if (surname != null) { resultStartPos = currentPos + (surnameInserted?0:surname.Length); string suffix = MatchSuffix(_text, resultStartPos, out resultStartPos); if (suffix != null) { if (!surnameInserted) { prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); surnameInserted = true; currentPos += surname.Length; } if (resultStartPos > currentPos) { string givenname = 
_text.Substring(currentPos, resultStartPos - currentPos); prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR)); prc.Add(ParseResult.Create(suffix, resultStartPos, POSType.D_N)); currentPos += givenname.Length + suffix.Length; givennameInserted = true; } else { prc.Add(ParseResult.Create(suffix, currentPos, POSType.D_N)); currentPos += suffix.Length; } return(prc); } } // 5 if a verb or causative verb is found next to it, it can be treated as a person name if (surname != null) { resultStartPos = currentPos + (surnameInserted ? 0 : surname.Length); bool verbFound = MatchVerb(_text, resultStartPos, out resultStartPos); if (verbFound && resultStartPos > currentPos + (surnameInserted ? 0 : surname.Length)) { if (!surnameInserted) { prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); surnameInserted = true; currentPos += surname.Length; } if (!givennameInserted) { string givenname = _text.Substring(currentPos, resultStartPos - currentPos); prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR)); currentPos += givenname.Length; givennameInserted = true; } } } if (surname != null) { //punctuation directly after the name: treat it as a person name int punctuationPos = MatchPunctation(_text, currentPos + (surnameInserted ? 
0 : surname.Length), 4); if (punctuationPos > 0) { if (!surnameInserted) { prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); surnameInserted = true; currentPos += surname.Length; } if (!givennameInserted) { string givenname = _text.Substring(currentPos, punctuationPos - currentPos); prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR)); currentPos += givenname.Length; givennameInserted = true; } } } if (surname != null && _text.Length - currentPos - surname.Length <= MaximumGivennameLength && _text.Length - currentPos - surname.Length > 0) //case where no characters follow the name { if (!surnameInserted) { prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR)); surnameInserted = true; currentPos += surname.Length; } if (!givennameInserted) { string givenname = _text.Substring(currentPos, _text.Length - currentPos); prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR)); currentPos += givenname.Length; givennameInserted = true; } } return(prc); }
// Tries to recognise a Chinese postal address in context.Text starting at startIndex and
// returns it as one POSType.D_S result whose value is a ChineseAddress.
//
// How it works, as far as the code below shows:
//   - Only a window of at most maxChineseAddressLength characters is examined.
//   - A leading "中国" is stored as the country.
//   - The window is scanned character by character. Ordinary characters accumulate in "sb";
//     marker characters (市/场, 区, 省, 乡/村/县/镇, 楼/弄/号/室, 道/路/街, brackets, 大, 餐) decide
//     which ChineseAddress field the accumulated text goes to. Recognised parts are appended
//     to "whole", which becomes the text of the result.
//   - City / district / province candidates are validated through GetMaximumMatch(subStr, 0, 5);
//     floor / lane / number / room require the accumulated text to parse as an integer after
//     NumeralUtil.ConvertChineseNumeral2Arabic.
//   - Chinese punctuation or a space ends the scan.
//   - A result is produced only when "whole" is non-empty; text still pending in "sb" at that
//     point is stored in ca.extra.
//
// NOTE(review): the branch for '巷' is empty, so that character is neither accumulated nor
// reported - confirm whether this is intended.
// NOTE(review): the pairs "ch == '(' || ch == '('", "ch == ')' || ch == ')'" and
// "ch == ' ' || ch == ' '" compare against what looks like the same character twice;
// presumably one operand was a full-width variant - confirm against the original encoding.
// NOTE(review): the source text of this method is collapsed onto a few physical lines, so the
// inline "//" comments below swallow the code that follows them on the same line; the layout
// is left untouched here.
public ParseResultCollection Parse(int startIndex) { string _text = context.Text; ParseResultCollection prc = new ParseResultCollection(); string temp = _text.Substring(startIndex, Math.Min(maxChineseAddressLength, _text.Length - startIndex)); char[] chars = temp.ToCharArray(); //int lastStartPos = 0; StringBuilder sb = new StringBuilder(); StringBuilder whole = new StringBuilder(); ChineseAddress ca=new ChineseAddress(); int startpos = 0; //TODO: look up the country name via a dictionary if (temp.StartsWith("中国")) { startpos = 2; ca.country = "中国"; whole.Append("中国"); } for (int i = startpos; i < chars.Length; i++) { char ch = chars[i]; if (ch == '市'||ch=='场') { if (sb.Length == 0) { sb.Append(ch); continue; } sb.Append(ch); string subStr = sb.ToString(); string city = GetMaximumMatch(subStr,0,5); if (city != null) { ca.city = city; whole.Append(ca.city); sb = new StringBuilder(); } } else if (ch == '区') { if (sb.Length == 0) { sb.Append(ch); continue; } sb.Append(ch); string subStr = sb.ToString(); string district = GetMaximumMatch(subStr, 0, 5); if (district != null) { if (!district.EndsWith("区")) { ca.city = district; whole.Append(ca.city); ca.district = subStr.Substring(ca.city.Length); whole.Append(ca.district); } else { //string district = NEParser.GetMaximumMatch(subStr, 0, 5, "district", _cityNames, null); ca.district = district; whole.Append(ca.district); } } else { ca.district = subStr; whole.Append(ca.district); } sb = new StringBuilder(); } else if (ch == '省') { if (sb.Length == 0) { sb.Append(ch); continue; } sb.Append(ch); string subStr = sb.ToString(); string province = GetMaximumMatch(subStr, 0, 5); //province if (province != null) { ca.province = province; whole.Append(ca.province); sb = new StringBuilder(); } } else if (ch == '乡' || ch == '村' || ch == '县' || ch == '镇') { if (sb.Length == 0) { sb.Append(ch); continue; } sb.Append(ch); ca.county = sb.ToString(); whole.Append(ca.county); sb = new StringBuilder(); } else if (ch == '巷') { } else if (ch == '楼'||ch == '弄'||ch == '号'||ch == 
'室') { if (sb.Length == 0) { sb.Append(ch); continue; } string substr = NumeralUtil.ConvertChineseNumeral2Arabic(sb.ToString()); int x; sb.Append(ch); if (Int32.TryParse(substr, out x)) { if (ch == '楼') ca.floor = sb.ToString(); else if (ch == '弄') ca.lane = sb.ToString(); else if (ch == '号') ca.no = sb.ToString(); else if (ch == '室') ca.room = sb.ToString(); whole.Append(sb.ToString()); sb = new StringBuilder(); } } else if (ch == '道' || ch == '路' || ch == '街') { if (sb.Length == 0) { sb.Append(ch); continue; } sb.Append(ch); ca.street = sb.ToString(); whole.Append(ca.street); sb = new StringBuilder(); } else if (ch == '(' || ch == '(') { sb = new StringBuilder(); sb.Append(ch); } else if (ch == ')' || ch == ')') { sb.Append(ch); string extra1 = sb.ToString(); whole.Append(extra1); ca.extra = extra1; sb = new StringBuilder(); } else if (CharacterUtil.IsChinesePunctuation(ch) || (ch == ' ' || ch == ' ')) { break; } else if (ch == '大') { if (sb.Length == 0) { sb.Append(ch); continue; } if (i + 1 < chars.Length) { char nextchar = chars[i + 1]; if (nextchar == '桥' || nextchar == '厦') { string extra1 = sb.ToString() + "大" + nextchar; whole.Append(extra1); if (nextchar == '桥') ca.extra += extra1; else ca.building = extra1; i += 2-1; sb = new StringBuilder(); } else if (i + 2 < chars.Length && nextchar == '酒') { char nextchar2 = chars[i + 2]; if (nextchar2 == '店') { string extra1 = sb.ToString() + "大" + nextchar+ nextchar2; string city = GetMaximumMatch(extra1, 0, 5); //city or province if (city != null) { ca.city = city; whole.Append(ca.city); extra1 = extra1.Substring(ca.city.Length); } whole.Append(extra1); ca.building= extra1; i += 3-1; sb = new StringBuilder(); } } } } else if(ch=='餐') { if (sb.Length == 0) { sb.Append(ch); continue; } if (i + 1 < chars.Length) { char nextchar = chars[i + 1]; if (nextchar == '厅') { string extra1 = sb.ToString() + "餐" + nextchar; whole.Append(extra1); ca.extra += extra1; i += 2 - 1; sb = new StringBuilder(); } } } else { //if (sb.Length == 0) 
// lastStartPos = i; sb.Append(ch); string extra = sb.ToString(); if (extra.EndsWith("中心") || extra.EndsWith("酒店")) { string city = GetMaximumMatch(extra, 0, 5); //city if (city != null) { ca.city = city; extra = extra.Substring(city.Length); } ca.building = extra; whole.Append(extra); if (i + 2 < chars.Length && chars[i + 1] == '大' && chars[i + 2] == '厦') //handles "中心大厦" { ca.building += "大厦"; whole.Append("大厦"); i += 2; sb = new StringBuilder(); continue; } sb = new StringBuilder(); } } } if ( whole.Length>0) { if(sb.Length>0) ca.extra= sb.ToString(); prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.D_S,ca)); } return prc; }
/// <summary>
/// Scans a phone number that starts at <paramref name="startIndex"/> in the context text and
/// splits it into country code, area code, main number and extension.
/// </summary>
/// <param name="startIndex">Index in the context text at which scanning starts.</param>
/// <returns>
/// A collection holding one <c>POSType.A_M</c> result carrying the <c>PhoneNo</c>, or an empty
/// collection when the text starts with a space, when the scanned run is shorter than three
/// characters or exactly four characters long, or when no main number was assigned.
/// </returns>
/// <exception cref="NotImplementedException">
/// The context pattern is neither <c>ParserPattern.China</c> nor <c>ParserPattern.NorthAmerica</c>.
/// </exception>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParserPattern _pattern = context.Pattern;
    int k = startIndex;
    char ch;
    StringBuilder sb = new StringBuilder(10);
    ParseResultCollection prc = new ParseResultCollection();

    // A phone number never starts with an ASCII or ideographic (U+3000) space.
    if (_text[startIndex] == ' ' || _text[startIndex] == '\u3000')
    {
        return prc;
    }

    // Offset (inside sb) of the most recent '(' that has not been closed yet; -1 when balanced.
    int braceStartPos = -1;
    while (k < _text.Length)
    {
        ch = _text[k];
        if (!IsAllowedChar(ch, _pattern))
        {
            break;
        }

        // Fold full-width forms to their ASCII equivalents so the state machines below
        // only have to deal with one spelling of each symbol.
        if (ch >= '\uFF10' && ch <= '\uFF19')
        {
            ch = (char)(ch - '\uFF10' + '0');
        }

        if (ch == '\u3000')
        {
            ch = ' ';
        }
        else if (ch == '\uFF08')
        {
            ch = '(';
        }
        else if (ch == '\uFF09')
        {
            ch = ')';
        }
        else if (ch == '\uFF0D' || ch == '—')
        {
            ch = '-';
        }

        if (ch == '(')
        {
            // Record the offset relative to the scanned run (not to _text): it is used
            // below to cut allowedString, which starts at startIndex.
            braceStartPos = sb.Length;
        }
        else if (ch == ')')
        {
            braceStartPos = -1;
        }

        sb.Append(ch);
        k++;
    }

    string allowedString = sb.ToString().TrimEnd();
    if (braceStartPos >= 0)
    {
        // Drop everything from an unclosed '(' onwards.
        allowedString = allowedString.Substring(0, braceStartPos);
    }

    if (allowedString.Length < 3 || allowedString.Length == 4)
    {
        return prc;
    }

    bool bNumberInBrace = false;
    bool bCountryCodeStarted = false;
    bool bAreaCodeStarted = false;
    bool bExtStarted = false;
    int i = 0;
    StringBuilder segment = new StringBuilder();
    StringBuilder whole = new StringBuilder();
    PhoneNo phone = new PhoneNo();

    if (_pattern == ParserPattern.China)
    {
        while (i < allowedString.Length)
        {
            ch = allowedString[i];
            if (ch == '(')
            {
                bNumberInBrace = true;
                bCountryCodeStarted = false;
                whole.Append(ch);
            }
            else if (NumeralUtil.IsArabicNumeral(ch))
            {
                // The first digit run is treated as an area code unless a country code
                // is in progress or an area code has already been stored.
                if (segment.Length == 0 && !bAreaCodeStarted && phone.AreaCode == null && !bCountryCodeStarted)
                {
                    bAreaCodeStarted = true;
                }

                segment.Append(ch);
                whole.Append(ch);
            }
            else if (ch == ')' && bNumberInBrace)
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }

                    bCountryCodeStarted = false;
                }

                if (bAreaCodeStarted)
                {
                    // City codes starting with 0 have at most 4 digits; otherwise at most 3.
                    if (segment.Length > 0 && (segment[0] == '0' ? segment.Length <= 4 : segment.Length <= 3))
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }

                whole.Append(ch);
                segment = new StringBuilder();
                bNumberInBrace = false;
            }
            else if (ch == ' ')
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }

                    bCountryCodeStarted = false;
                }
                else if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                }

                segment = new StringBuilder();
                bCountryCodeStarted = false;
                whole.Append(ch);
            }
            else if (ch == '-' || ch == '#')
            {
                // Guard against an empty segment (leading or doubled separator) before
                // looking at its first digit.
                if (segment.Length > 0 && segment[0] == '0' && (segment.Length == 3 || segment.Length == 4))
                {
                    phone.AreaCode = segment.ToString();
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                    bExtStarted = true;
                }

                segment = new StringBuilder();
                whole.Append(ch);
            }
            else if (ch == '+')
            {
                whole.Append(ch);
                bCountryCodeStarted = true;
            }

            i++;
        }

        if (segment.Length > 0)
        {
            AssignPhoneMain(segment, phone);
            if (bExtStarted)
            {
                phone.Extension = segment.ToString();
                bExtStarted = false;
            }
        }
    }
    else if (_pattern == ParserPattern.NorthAmerica)
    {
        while (i < allowedString.Length)
        {
            ch = allowedString[i];
            if (NumeralUtil.IsArabicNumeral(ch))
            {
                whole.Append(ch);
                segment.Append(ch);
            }
            else if (ch == ' ')
            {
                whole.Append(ch);
            }
            else if (ch == '(')
            {
                bAreaCodeStarted = true;
                whole.Append(ch);
            }
            else if (ch == ')')
            {
                if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }

                segment = new StringBuilder();
                whole.Append(ch);
            }
            else if (ch == '-')
            {
                if (bCountryCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.CountryCode = segment.ToString();
                    }

                    bCountryCodeStarted = false;
                    bAreaCodeStarted = true;
                }
                else if (bAreaCodeStarted)
                {
                    if (segment.Length > 0)
                    {
                        phone.AreaCode = segment.ToString();
                    }

                    bAreaCodeStarted = false;
                }
                else if (segment.Length > 0)
                {
                    AssignPhoneMain(segment, phone);
                }

                whole.Append(ch);
                segment = new StringBuilder();
            }
            else if (ch == '+')
            {
                bCountryCodeStarted = true;
                whole.Append(ch);
            }
            else if (ch == '.')
            {
                // A dot is only accepted as the end of the literal "ext".
                if (segment.ToString() != "ext")
                {
                    break;
                }

                whole.Append("ext.");
            }
            else if (ch == 'e' || ch == 'x' || ch == 't')
            {
                segment.Append(ch);
            }

            i++;
        }

        if (segment.Length > 0)
        {
            AssignPhoneMain(segment, phone);

            // NOTE(review): bExtStarted is never set in the North America branch, so the
            // extension is never stored here, and the "ext" letters stay in the segment.
            if (bExtStarted)
            {
                phone.Extension = segment.ToString();
                bExtStarted = false;
            }
        }
    }
    else
    {
        throw new NotImplementedException("Phone No. in " + _pattern.ToString() + " is not implemented in the parser.");
    }

    if (whole.Length > 0 && phone.Main != null)
    {
        prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.A_M, phone));
    }

    return prc;
}
/// <summary>
/// Recognises a pronoun that begins at <paramref name="startIndex"/> in the context text.
/// </summary>
/// <param name="startIndex">Index of the character to inspect.</param>
/// <returns>
/// A collection with a single <c>POSType.D_R</c> result when a pronoun is found at that
/// position; otherwise an empty collection.
/// </returns>
public ParseResultCollection Parse(int startIndex)
{
    char[] letters = context.Text.ToArray();
    ParseResultCollection results = new ParseResultCollection();

    char head = letters[startIndex];

    // '\0' stands in for "no following character"; none of the comparisons below match it.
    char next = startIndex + 1 < letters.Length ? letters[startIndex + 1] : '\0';

    string pronoun = string.Empty;
    switch (head)
    {
        // Demonstratives, optionally followed by a one-character suffix.
        case '这':
        case '那':
            pronoun = head.ToString();
            if (next == '些' || next == '里' || next == '儿')
            {
                pronoun += next;
            }

            break;

        // Personal pronouns, optionally followed by the plural marker.
        case '你':
        case '我':
        case '他':
        case '它':
        case '她':
        case '咱':
            pronoun = head.ToString();
            if (next == '们')
            {
                pronoun += next;
            }

            break;

        // Pronouns that always stand alone.
        case '谁':
        case '朕':
        case '此':
        case '彼':
            pronoun = head.ToString();
            break;

        // Two-character pronouns that only count when both characters are present.
        case '大':
            if (next == '家')
            {
                pronoun = "大家";
            }

            break;

        case '什':
            if (next == '么')
            {
                pronoun = "什么";
            }

            break;

        case '自':
            if (next == '己')
            {
                pronoun = "自己";
            }

            break;

        case '哪':
            pronoun = head.ToString();
            if (next == '里')
            {
                pronoun += next;
            }

            break;
    }

    if (pronoun.Length > 0)
    {
        ParseResult found = new ParseResult();
        found.StartPos = startIndex;
        found.Text = pronoun;
        found.Type = POSType.D_R;
        results.Add(found);
    }

    return results;
}
/// <summary>
/// Scans a Chinese postal address that starts at <paramref name="startIndex"/> and fills a
/// <c>ChineseAddress</c> with the parts it recognises (country, province, city, district,
/// county, street, lane, number, floor, room, building, extra).
/// </summary>
/// <param name="startIndex">Index in the context text at which scanning starts.</param>
/// <returns>
/// A collection with one <c>POSType.D_S</c> result whose text is the concatenation of the
/// recognised parts, or an empty collection when nothing was recognised.
/// </returns>
/// <remarks>
/// At most <c>maxChineseAddressLength</c> characters are examined. Characters are collected
/// in a pending buffer until a keyword character (市, 区, 路, 号, ...) closes the current part.
/// </remarks>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();
    string temp = _text.Substring(startIndex, Math.Min(maxChineseAddressLength, _text.Length - startIndex));
    char[] chars = temp.ToCharArray();

    StringBuilder sb = new StringBuilder();      // pending characters of the part being read
    StringBuilder whole = new StringBuilder();   // all parts accepted so far
    ChineseAddress ca = new ChineseAddress();
    int startpos = 0;

    // TODO: look up the country name in a dictionary.
    if (temp.StartsWith("中国", StringComparison.Ordinal))
    {
        startpos = 2;
        ca.country = "中国";
        whole.Append("中国");
    }

    for (int i = startpos; i < chars.Length; i++)
    {
        char ch = chars[i];

        if (ch == '市' || ch == '场')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            string subStr = sb.ToString();
            string city = GetMaximumMatch(subStr, 0, 5);
            if (city != null)
            {
                ca.city = city;
                whole.Append(ca.city);
                sb = new StringBuilder();
            }
        }
        else if (ch == '区')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            string subStr = sb.ToString();
            string district = GetMaximumMatch(subStr, 0, 5);
            if (district != null)
            {
                if (!district.EndsWith("区", StringComparison.Ordinal))
                {
                    // The match is a city prefix; the remainder of the buffer is the district.
                    ca.city = district;
                    whole.Append(ca.city);
                    ca.district = subStr.Substring(ca.city.Length);
                    whole.Append(ca.district);
                }
                else
                {
                    ca.district = district;
                    whole.Append(ca.district);
                }
            }
            else
            {
                ca.district = subStr;
                whole.Append(ca.district);
            }

            sb = new StringBuilder();
        }
        else if (ch == '省')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            string subStr = sb.ToString();
            string province = GetMaximumMatch(subStr, 0, 5); // province
            if (province != null)
            {
                ca.province = province;
                whole.Append(ca.province);
                sb = new StringBuilder();
            }
        }
        else if (ch == '乡' || ch == '村' || ch == '县' || ch == '镇')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            ca.county = sb.ToString();
            whole.Append(ca.county);
            sb = new StringBuilder();
        }
        else if (ch == '巷')
        {
            // No dedicated handling yet; keep the character in the pending buffer so it is
            // not silently removed from the address text.
            sb.Append(ch);
        }
        else if (ch == '楼' || ch == '弄' || ch == '号' || ch == '室')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            // The part is only accepted when everything before the keyword is a number.
            string substr = NumeralUtil.ConvertChineseNumeral2Arabic(sb.ToString());
            int x;
            sb.Append(ch);
            if (Int32.TryParse(substr, out x))
            {
                if (ch == '楼')
                {
                    ca.floor = sb.ToString();
                }
                else if (ch == '弄')
                {
                    ca.lane = sb.ToString();
                }
                else if (ch == '号')
                {
                    ca.no = sb.ToString();
                }
                else if (ch == '室')
                {
                    ca.room = sb.ToString();
                }

                whole.Append(sb.ToString());
                sb = new StringBuilder();
            }
        }
        else if (ch == '道' || ch == '路' || ch == '街')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            sb.Append(ch);
            ca.street = sb.ToString();
            whole.Append(ca.street);
            sb = new StringBuilder();
        }
        else if (ch == '(' || ch == '\uFF08')
        {
            // An opening bracket (ASCII or full-width) starts a fresh buffer.
            sb = new StringBuilder();
            sb.Append(ch);
        }
        else if (ch == ')' || ch == '\uFF09')
        {
            sb.Append(ch);
            string extra1 = sb.ToString();
            whole.Append(extra1);
            ca.extra = extra1;
            sb = new StringBuilder();
        }
        else if (CharacterUtil.IsChinesePunctuation(ch) || ch == ' ' || ch == '\u3000')
        {
            // Punctuation or a space (ASCII or ideographic) ends the address.
            break;
        }
        else if (ch == '大')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            bool keywordMatched = false;
            if (i + 1 < chars.Length)
            {
                char nextchar = chars[i + 1];
                if (nextchar == '桥' || nextchar == '厦')
                {
                    string extra1 = sb.ToString() + "大" + nextchar;
                    whole.Append(extra1);
                    if (nextchar == '桥')
                    {
                        ca.extra += extra1;
                    }
                    else
                    {
                        ca.building = extra1;
                    }

                    i += 1; // skip the look-ahead character that was consumed
                    sb = new StringBuilder();
                    keywordMatched = true;
                }
                else if (i + 2 < chars.Length && nextchar == '酒')
                {
                    char nextchar2 = chars[i + 2];
                    if (nextchar2 == '店')
                    {
                        string extra1 = sb.ToString() + "大" + nextchar + nextchar2;
                        string city = GetMaximumMatch(extra1, 0, 5); // city or province
                        if (city != null)
                        {
                            ca.city = city;
                            whole.Append(ca.city);
                            extra1 = extra1.Substring(ca.city.Length);
                        }

                        whole.Append(extra1);
                        ca.building = extra1;
                        i += 2; // skip the two look-ahead characters that were consumed
                        sb = new StringBuilder();
                        keywordMatched = true;
                    }
                }
            }

            if (!keywordMatched)
            {
                // '大' is an ordinary character here; keep it instead of dropping it.
                sb.Append(ch);
            }
        }
        else if (ch == '餐')
        {
            if (sb.Length == 0)
            {
                sb.Append(ch);
                continue;
            }

            bool keywordMatched = false;
            if (i + 1 < chars.Length)
            {
                char nextchar = chars[i + 1];
                if (nextchar == '厅')
                {
                    string extra1 = sb.ToString() + "餐" + nextchar;
                    whole.Append(extra1);
                    ca.extra += extra1;
                    i += 1; // skip the look-ahead character that was consumed
                    sb = new StringBuilder();
                    keywordMatched = true;
                }
            }

            if (!keywordMatched)
            {
                // '餐' is an ordinary character here; keep it instead of dropping it.
                sb.Append(ch);
            }
        }
        else
        {
            sb.Append(ch);
            string extra = sb.ToString();
            if (extra.EndsWith("中心", StringComparison.Ordinal) || extra.EndsWith("酒店", StringComparison.Ordinal))
            {
                string city = GetMaximumMatch(extra, 0, 5); // city
                if (city != null)
                {
                    ca.city = city;
                    extra = extra.Substring(city.Length);
                }

                ca.building = extra;
                whole.Append(extra);

                // Handle "中心大厦": fold the trailing "大厦" into the same building name.
                if (i + 2 < chars.Length && chars[i + 1] == '大' && chars[i + 2] == '厦')
                {
                    ca.building += "大厦";
                    whole.Append("大厦");
                    i += 2;
                    sb = new StringBuilder();
                    continue;
                }

                sb = new StringBuilder();
            }
        }
    }

    if (whole.Length > 0)
    {
        // Whatever is still pending is reported as the extra part.
        if (sb.Length > 0)
        {
            ca.extra = sb.ToString();
        }

        prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.D_S, ca));
    }

    return prc;
}
//public static ParseResultCollection Parse(string text)
//{
//    return ParseResultCollection.InternalParse(text, new OrgNameParser(text));
//}

/// <summary>
/// Recognises an organisation name that starts at <paramref name="startIndex"/> by locating a
/// known suffix from <c>suffixList</c> and then looking for a place name in front of it.
/// </summary>
/// <param name="startIndex">Index in the context text at which scanning starts.</param>
/// <returns>
/// A collection with one <c>POSType.A_NT</c> result when an organisation name is recognised;
/// otherwise an empty collection.
/// </returns>
/// <remarks>
/// At most <c>maxChineseOrgNameLength</c> characters are examined. A suffix at offset 0 does
/// not count, because the name would then have no stem.
/// </remarks>
public ParseResultCollection Parse(int startIndex)
{
    string _text = context.Text;
    ParseResultCollection prc = new ParseResultCollection();
    string temp = _text.Substring(startIndex, Math.Min(maxChineseOrgNameLength, _text.Length - startIndex));

    // Find the first suffix (in list order) that occurs after at least one character.
    // Ordinal search: these are fixed keywords, not culture-dependent text.
    int pos = -1;
    string suffix = null;
    for (int i = 0; i < suffixList.Length; i++)
    {
        pos = temp.IndexOf(suffixList[i], StringComparison.Ordinal);
        if (pos > 0)
        {
            suffix = suffixList[i];
            break;
        }
    }

    if (pos <= 0) // no suffix found, nothing to report
    {
        return prc;
    }

    // Look for a leading place name inside the window.
    string placeName = null;
    ParserContext context1 = this.context.Clone();
    context1.Text = temp;
    IParser placeNameParser = new PlaceNameParser(context1);
    ParseResultCollection prc1 = placeNameParser.Parse(0);
    if (prc1.Count > 0)
    {
        placeName = (string)prc1[0].Text;
    }

    // Opening bracket (ASCII or full-width) in the whole context text, searched once.
    int bracePos = context.Text.IndexOfAny(new char[] { '(', '\uFF08' });

    if (placeName != null && pos - placeName.Length < maxMiddlePartLength)
    {
        prc.Add(ParseResult.Create(temp.Substring(0, pos + suffix.Length), startIndex, POSType.A_NT));
    }
    else if (bracePos > 0)
    {
        // The place name may be given in brackets instead of at the front. Only parse when
        // there is at least one character after the bracket.
        if (bracePos + 1 < context.Text.Length)
        {
            IParser placeNameParser2 = new PlaceNameParser(context);
            ParseResultCollection prc2 = placeNameParser2.Parse(bracePos + 1);
            if (prc2.Count > 0)
            {
                prc.Add(ParseResult.Create(temp.Substring(0, pos + suffix.Length), startIndex, POSType.A_NT));
            }
        }
    }
    else
    {
        // No place name found: fall back to the organisation-name lookup.
        string orgName = MatchOrgName(temp, 0);
        if (orgName != null)
        {
            prc.Add(ParseResult.Create(orgName, startIndex, POSType.A_NT));
        }

        // Otherwise the name is unknown; locating the boundary with predicates is not
        // implemented yet.
    }

    return prc;

    /*
     * From "Modern Chinese Lexical Research - Chinese Information Processing"
     * Confirmation rules
     *   a. If the word before the candidate place-name string is a place-name boundary word and
     *      the word after it is a place-name feature word, both boundaries of the candidate are confirmed.
     *   b. If the word before the candidate is a place-name boundary word, its left boundary is confirmed.
     *   c. If the word after the candidate is a place-name boundary word, its right boundary is confirmed.
     *   d. If two candidates are in a coordinate relation and one of them is confirmed, the other is
     *      confirmed as well.
     * Rejection rules
     *   Title-word rule: if the word before the candidate is a personal title word and the candidate
     *     contains no place-name feature word, reject the candidate.
     *   Boundary-word rule: if the word after the candidate is a personal-name boundary word and the
     *     candidate contains no place-name feature word, reject the candidate.
     *   Coordination rule: if two candidates are in a coordinate relation and one of them is rejected,
     *     the other is rejected as well.
     *   Other-object rule: if the word after the candidate is a feature word of another object class,
     *     reject the candidate (for example 红塔山香烟).
     *   Non-single-character rule: if the word before or the word after the candidate is not a
     *     single-character word, reject the candidate.
     * Boundary correction rules
     *   Title word and feature word: if the word before the candidate is a personal title word and the
     *     candidate contains a place-name feature word, correct the boundary of the place name.
     */
}
/// <summary>
/// Recognises a Chinese date/time expression that starts at <paramref name="startIndex"/>.
/// Two forms are handled: relative expressions (a prefix character followed by 周, 天 or 年)
/// and numeric expressions built from digit runs followed by 年, 月, 日, 点, 分 or 秒.
/// </summary>
/// <param name="startIndex">Index in the numeral-converted context text at which scanning starts.</param>
/// <returns>
/// A collection with one <c>POSType.D_T</c> result (carrying the parsed date for numeric
/// expressions), or an empty collection when nothing is recognised.
/// </returns>
/// <remarks>
/// The context text is first passed through <c>NumeralUtil.ConvertChineseNumeral2Arabic</c>, and
/// at most <c>maxDateTimeTextLength</c> characters of the converted text are examined.
/// </remarks>
public ParseResultCollection Parse(int startIndex)
{
    string text = NumeralUtil.ConvertChineseNumeral2Arabic(context.Text);
    ParseResultCollection prc = new ParseResultCollection();

    // The scan runs on the converted text; if startIndex lies beyond its end there is
    // nothing to scan (and Substring would throw on a negative length).
    if (startIndex > text.Length)
    {
        return prc;
    }

    int boundary = Math.Min(maxDateTimeTextLength, text.Length - startIndex);
    string temp = text.Substring(startIndex, boundary);

    StringBuilder sbDateText = new StringBuilder();     // digits + unit characters handed to the date parser
    StringBuilder sbPatternText = new StringBuilder();  // matching pattern, e.g. "yyyy年M月d日"
    StringBuilder sbText = new StringBuilder();         // text reported in the result
    int strLen = 0;                                     // length of the digit run not yet bound to a unit
    char prevCh = ' ';
    bool nonNumeric = false;

    for (int i = 0; i < boundary; i++)
    {
        char ch = temp[i];

        if (NumeralUtil.IsArabicNumeral(ch))
        {
            sbDateText.Append(ch);
            sbText.Append(ch);
            strLen++;
            prevCh = ch;
            continue;
        }

        if (ch == ' ')
        {
            // Spaces are kept in the reported text but do not count as the previous character.
            sbText.Append(ch);
            continue;
        }

        // Relative expressions: prefix character + 周 / 天 / 年.
        bool relative =
            (ch == '周' && prevCh == '上') ||
            (ch == '天' && (prevCh == '前' || prevCh == '昨' || prevCh == '明' || prevCh == '今' || prevCh == '后')) ||
            (ch == '年' && (prevCh == '去' || prevCh == '前' || prevCh == '今' || prevCh == '后'));
        if (relative)
        {
            nonNumeric = true;
            sbText.Append(prevCh);
            sbText.Append(ch);
            break;
        }

        bool recognised = true;
        char patternLetter = '\0'; // '\0' means "no unit, just remember the character"
        switch (ch)
        {
            case '年':
                patternLetter = 'y';
                break;
            case '月':
                patternLetter = 'M';
                break;
            case '日':
                patternLetter = 'd';
                break;
            case '点':
                patternLetter = 'h';
                break;
            case '分':
                patternLetter = 'm';
                break;
            case '秒':
                patternLetter = 's';
                break;

            // Prefix characters are only remembered through prevCh. '上' is included so
            // that the '周' test above can be reached.
            case '大':
            case '前':
            case '昨':
            case '明':
            case '今':
            case '后':
            case '去':
            case '上':
            case '周':
            case '天':
                break;

            default:
                recognised = false;
                break;
        }

        if (!recognised)
        {
            break;
        }

        if (patternLetter != '\0')
        {
            // A unit character must be preceded by at least one digit.
            if (strLen == 0)
            {
                return prc;
            }

            sbDateText.Append(ch);
            sbPatternText.Append(DateUtil.GeneratePatternText(patternLetter, strLen));
            sbPatternText.Append(ch);
            sbText.Append(ch);
            strLen = 0;
        }

        prevCh = ch;
    }

    if (sbText.Length > 0 && nonNumeric)
    {
        prc.Add(ParseResult.Create(sbText.ToString(), startIndex, POSType.D_T));
        return prc;
    }

    if (sbDateText.Length == 0 || sbPatternText.Length == 0)
    {
        return prc;
    }

    DateTime? dt = DateUtil.ParseDate(sbDateText.ToString(), sbPatternText.ToString());
    if (dt != null)
    {
        string result = sbText.ToString();
        prc.Add(ParseResult.Create(result, startIndex, POSType.D_T, dt));
    }

    return prc;
}