Ejemplo n.º 1
0
        /// <summary>
        /// Scans <paramref name="text"/> from left to right and collects the results produced by the
        /// parsers created from <c>parserConstructors</c>.
        /// </summary>
        /// <remarks>
        /// At each position the parsers are tried in the order of <c>parserConstructors</c>; the first one
        /// that returns a non-empty collection wins and the scan resumes after the matched text. When no
        /// parser matches, the scan advances by exactly one character. Exceptions thrown by a parser are
        /// written to the console and treated as "no match" for that parser.
        /// </remarks>
        /// <param name="text">Text to analyse.</param>
        /// <param name="pattern">Pattern stored on the <c>ParserContext</c> handed to every parser.</param>
        /// <returns>All parse results in the order they were found; empty when nothing matched.</returns>
        public ParseResultCollection Recognize(string text, ParserPattern pattern)
        {
            ParserContext context = new ParserContext();

            context.Pattern = pattern;
            context.Text    = text;

            ParseResultCollection result = new ParseResultCollection();

            char[] chars = text.ToCharArray();

            int i = 0;

            while (i < chars.Length)
            {
                char c = chars[i];

                // Punctuation never starts an entity; skip it without consulting the parsers.
                if (CharacterUtil.IsChinesePunctuation(c))
                {
                    i++;
                    continue;
                }

                // Remember where this round started so forward progress can be enforced below.
                int positionBefore = i;

                // Original note: place names are scanned ahead of personal names to rule out wrong
                // person-name matches (presumably via the ordering of parserConstructors - confirm).
                foreach (ConstructorInfo ci in parserConstructors)
                {
                    IParser parser = ci.Invoke(new object[] { context }) as IParser;

                    if (parser == null)
                    {
                        // The constructed object is not an IParser; nothing to run.
                        continue;
                    }

                    try
                    {
                        ParseResultCollection prc = parser.Parse(i);

                        if (prc != null && prc.Count > 0)
                        {
                            foreach (ParseResult pr in prc)
                            {
                                result.Add(pr);
                                i += pr.Length;
                            }
                            break;
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: a failing parser must not abort the whole scan.
                        Console.WriteLine(ex);
                    }
                }

                // Advance exactly once per position when nothing matched. (Previously this increment
                // lived inside the parser loop, so every failing parser moved the position and later
                // parsers were tried at the wrong offset.) The same check also prevents an endless
                // loop when a match reports a total length of zero.
                if (i <= positionBefore)
                {
                    i = positionBefore + 1;
                }
            }
            return(result);
        }
Ejemplo n.º 2
0
 // Looks for the first Chinese punctuation mark in the window of maxlength characters that
 // begins one position AFTER startIndex (the character at startIndex itself is never tested).
 // Returns the index of that punctuation mark, or -1 when the window contains none.
 int MatchPunctation(string text, int startIndex, int maxlength)
 {
     int limit = startIndex + maxlength;
     int probe = startIndex;

     while (probe < limit)
     {
         int candidate = probe + 1;

         // Positions beyond the end of the text are simply ignored.
         bool inRange = candidate < text.Length;
         if (inRange && CharacterUtil.IsChinesePunctuation(text[candidate]))
         {
             return candidate;
         }
         probe++;
     }
     return -1;
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Tries to recognize a person name in <c>context.Text</c> beginning at <paramref name="startIndex"/>.
        /// </summary>
        /// <remarks>
        /// An optional prefix (MatchPrefix) is looked up first, then a surname (MatchSurname). The surname
        /// is only added to the result when some supporting evidence is found: a prefix before it, a suffix
        /// after it (MatchSuffix), a verb after it (MatchVerb), a punctuation mark shortly after it
        /// (MatchPunctation), or a short remaining tail of text. The text between the surname and that
        /// evidence is emitted as the given name.
        /// </remarks>
        /// <param name="startIndex">Index into the context text at which matching begins.</param>
        /// <returns>
        /// The collected results in the order they were added; the collection is empty when
        /// MatchSurname returns null, and may be empty when no supporting evidence is found.
        /// </returns>
        public ParseResultCollection Parse(int startIndex)
        {
            string _text = context.Text;
            ParseResultCollection prc = new ParseResultCollection();
            //TODO: handle Chinese renderings of foreign names (which have no surname)

            //3 Look for a prefix
            string prefix       = MatchPrefix(_text, startIndex);
            int    prefixlength = 0;

            if (prefix != null)
            {
                prefixlength = prefix.Length;
            }
            //1 Scan for a surname from the surname list
            // Look up a single-character surname
            int    currentPos = startIndex + prefixlength;
            string surname    = MatchSurname(_text, currentPos);

            // Without a surname nothing can be recognized; prc is still empty here.
            if (surname == null)
            {
                return(prc);
            }
            // Track what has already been emitted so the later stages do not add it twice.
            bool surnameInserted   = false;
            bool givennameInserted = false;

            // surname is always non-null at this point, so this effectively tests "prefix found".
            if (prefix != null && surname != null)
            {
                prc.Add(ParseResult.Create(prefix, startIndex, POSType.D_N));    // prefix
                surnameInserted = true;
                prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
                currentPos += surname.Length;
            }
            //2 If the surname is directly followed by punctuation, treat it as not being a person name
            // NOTE(review): currentPos has only been advanced past the surname when a prefix was found,
            // so currentPos + 1 refers to a different relative position on the two paths - confirm intended.
            if (currentPos + 1 < _text.Length &&
                CharacterUtil.IsChinesePunctuation(_text[currentPos + 1]))
            {
                return(prc);
            }
            //1.1 Use maximum matching to search the library for a complete name; if it matches with a high weight, accept it as a person name directly
            //string fullname = MatchFullname(_text, startIndex);

            //if (fullname != null)
            //{
            //    prc.Add(ParseResult.Create(surname, startIndex, POSType.A_NR));
            //    prc.Add(ParseResult.Create(fullname.Substring(surname.Length), startIndex + surname.Length, POSType.A_NR));
            //    return prc;
            //}

            //3 Look for the given name
            //TODO: narrow the range of given names, otherwise false matches are likely
            //string givenname = MatchGivenname(_text, startIndex + surname.Length);
            //if (givenname != null)
            //{
            //    string suffix2 = MatchSuffix(_text, startIndex + surname.Length + givenname.Length, _siblingWordDB);
            //    if (suffix != null && givenname.Length <= suffix.Length)
            //    {
            //        givenname = null;
            //    }
            //    else
            //    {
            //        suffix = suffix2;
            //    }
            //}
            //4 If a title follows (e.g. Mr., Miss, Dr., doctor), treat it as a person name
            int resultStartPos = -1;

            if (surname != null)
            {
                // Position just after the surname, whether or not currentPos already includes it.
                resultStartPos = currentPos + (surnameInserted?0:surname.Length);
                // The same variable is passed as the start argument and as the out result; the code below
                // treats the out value as the index where the suffix begins (confirm against MatchSuffix).
                string suffix = MatchSuffix(_text, resultStartPos, out resultStartPos);
                if (suffix != null)
                {
                    if (!surnameInserted)
                    {
                        prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
                        surnameInserted = true;
                        currentPos     += surname.Length;
                    }
                    if (resultStartPos > currentPos)
                    {
                        // Characters between the surname and the suffix are emitted as the given name.
                        string givenname = _text.Substring(currentPos, resultStartPos - currentPos);
                        prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR));
                        prc.Add(ParseResult.Create(suffix, resultStartPos, POSType.D_N));
                        currentPos       += givenname.Length + suffix.Length;
                        givennameInserted = true;
                    }
                    else
                    {
                        // The suffix follows the surname immediately; there is no given name.
                        prc.Add(ParseResult.Create(suffix, currentPos, POSType.D_N));
                        currentPos += suffix.Length;
                    }
                    // A suffix match ends the analysis; the position updates above are not read again.
                    return(prc);
                }
            }

            // 5 Original note: if preceded by a verb or causative verb, it can be treated as a person name
            // (the search below starts after the surname and the text up to the reported position
            // is emitted as the given name).
            if (surname != null)
            {
                resultStartPos = currentPos + (surnameInserted ? 0 : surname.Length);
                bool verbFound = MatchVerb(_text, resultStartPos, out resultStartPos);
                // Require at least one character between the surname and the reported position.
                if (verbFound && resultStartPos > currentPos + (surnameInserted ? 0 : surname.Length))
                {
                    if (!surnameInserted)
                    {
                        prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
                        surnameInserted = true;
                        currentPos     += surname.Length;
                    }
                    if (!givennameInserted)
                    {
                        string givenname = _text.Substring(currentPos, resultStartPos - currentPos);
                        prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR));
                        currentPos       += givenname.Length;
                        givennameInserted = true;
                    }
                }
            }
            if (surname != null)
            {
                // A punctuation mark directly after the name: treat it as a person name
                // (the search window passed to MatchPunctation is 4).
                int punctuationPos = MatchPunctation(_text, currentPos + (surnameInserted ? 0 : surname.Length), 4);
                // Only a positive index is accepted as "found".
                if (punctuationPos > 0)
                {
                    if (!surnameInserted)
                    {
                        prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
                        surnameInserted = true;
                        currentPos     += surname.Length;
                    }
                    if (!givennameInserted)
                    {
                        // Everything up to the punctuation mark is emitted as the given name.
                        string givenname = _text.Substring(currentPos, punctuationPos - currentPos);
                        prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR));
                        currentPos       += givenname.Length;
                        givennameInserted = true;
                    }
                }
            }
            // NOTE(review): when the surname has already been inserted, currentPos already includes
            // surname.Length, so it is subtracted a second time in this condition - confirm intended.
            if (surname != null && _text.Length - currentPos - surname.Length <= MaximumGivennameLength && _text.Length - currentPos - surname.Length > 0)  // the case where no text follows the full name
            {
                if (!surnameInserted)
                {
                    prc.Add(ParseResult.Create(surname, currentPos, POSType.A_NR));
                    surnameInserted = true;
                    currentPos     += surname.Length;
                }
                if (!givennameInserted)
                {
                    // The whole remaining tail of the text is emitted as the given name.
                    string givenname = _text.Substring(currentPos, _text.Length - currentPos);
                    prc.Add(ParseResult.Create(givenname, currentPos, POSType.A_NR));
                    currentPos       += givenname.Length;
                    givennameInserted = true;
                }
            }
            return(prc);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Tries to recognize a Chinese postal address in <c>context.Text</c> beginning at
        /// <paramref name="startIndex"/>.
        /// </summary>
        /// <remarks>
        /// At most <c>maxChineseAddressLength</c> characters are examined. Characters are accumulated in a
        /// buffer until a marker character (city, district, province, street, number, ...) closes the
        /// current component; the component is then stored on a <c>ChineseAddress</c> and appended to the
        /// overall matched string. Scanning stops at Chinese punctuation or a space.
        /// GetMaximumMatch is presumably a dictionary longest-match lookup that returns null when nothing
        /// matches - confirm against its definition.
        /// </remarks>
        /// <param name="startIndex">Index into the context text at which matching begins.</param>
        /// <returns>
        /// A collection holding a single result (type <c>POSType.D_S</c>, carrying the ChineseAddress)
        /// when at least one component was recognized; otherwise an empty collection.
        /// </returns>
        public ParseResultCollection Parse(int startIndex)
        {
            string _text = context.Text;
            ParseResultCollection prc = new ParseResultCollection();
            // Limit the scan window to maxChineseAddressLength characters (or to the end of the text).
            string temp = _text.Substring(startIndex, Math.Min(maxChineseAddressLength, _text.Length - startIndex));

            char[] chars = temp.ToCharArray();
            //int lastStartPos = 0;
            // sb: characters of the component currently being accumulated.
            // whole: concatenation of every component recognized so far.
            StringBuilder  sb       = new StringBuilder();
            StringBuilder  whole    = new StringBuilder();
            ChineseAddress ca       = new ChineseAddress();
            int            startpos = 0;

            //TODO: look up the country name via a dictionary
            if (temp.StartsWith("中国"))
            {
                startpos   = 2;
                ca.country = "中国";
                whole.Append("中国");
            }
            for (int i = startpos; i < chars.Length; i++)
            {
                char ch = chars[i];
                // In every marker branch below, a marker that arrives while the buffer is empty is
                // simply stored as the first character of a new component instead of closing one.
                if (ch == '市' || ch == '场')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    sb.Append(ch);
                    string subStr = sb.ToString();
                    string city   = GetMaximumMatch(subStr, 0, 5);
                    // When no match is found the buffer is kept and keeps growing.
                    if (city != null)
                    {
                        ca.city = city;
                        whole.Append(ca.city);
                        sb = new StringBuilder();
                    }
                }
                else if (ch == '区')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    sb.Append(ch);
                    string subStr = sb.ToString();

                    string district = GetMaximumMatch(subStr, 0, 5);
                    if (district != null)
                    {
                        if (!district.EndsWith("区"))
                        {
                            // The match is not itself a district: it is stored as the city and the
                            // remainder of the buffer becomes the district.
                            ca.city = district;
                            whole.Append(ca.city);
                            ca.district = subStr.Substring(ca.city.Length);
                            whole.Append(ca.district);
                        }
                        else
                        {
                            //string district = NEParser.GetMaximumMatch(subStr, 0, 5, "district", _cityNames, null);
                            ca.district = district;
                            whole.Append(ca.district);
                        }
                    }
                    else
                    {
                        // No match: the whole buffer is taken as the district.
                        ca.district = subStr;
                        whole.Append(ca.district);
                    }
                    sb = new StringBuilder();
                }
                else if (ch == '省')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    sb.Append(ch);
                    string subStr   = sb.ToString();
                    string province = GetMaximumMatch(subStr, 0, 5);    // province
                    if (province != null)
                    {
                        ca.province = province;
                        whole.Append(ca.province);
                        sb = new StringBuilder();
                    }
                }
                else if (ch == '乡' || ch == '村' || ch == '县' || ch == '镇')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // These markers close the component unconditionally (no lookup); it is stored as the county.
                    sb.Append(ch);
                    ca.county = sb.ToString();
                    whole.Append(ca.county);
                    sb = new StringBuilder();
                }
                else if (ch == '巷')
                {
                    // Intentionally empty in the source: this character is neither buffered nor emitted.
                }
                else if (ch == '楼' || ch == '弄' || ch == '号' || ch == '室')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // The buffer before the marker must convert to an integer for the component to be accepted.
                    string substr = NumeralUtil.ConvertChineseNumeral2Arabic(sb.ToString());
                    int    x;
                    sb.Append(ch);
                    if (Int32.TryParse(substr, out x))
                    {
                        if (ch == '楼')
                        {
                            ca.floor = sb.ToString();
                        }
                        else if (ch == '弄')
                        {
                            ca.lane = sb.ToString();
                        }
                        else if (ch == '号')
                        {
                            ca.no = sb.ToString();
                        }
                        else if (ch == '室')
                        {
                            ca.room = sb.ToString();
                        }
                        whole.Append(sb.ToString());
                        sb = new StringBuilder();
                    }
                }
                else if (ch == '道' || ch == '路' || ch == '街')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // Street markers close the component unconditionally.
                    sb.Append(ch);
                    ca.street = sb.ToString();
                    whole.Append(ca.street);
                    sb = new StringBuilder();
                }
                // NOTE(review): both operands look identical here; one was presumably the full-width
                // parenthesis before an encoding normalization - confirm against the original file.
                else if (ch == '(' || ch == '(')
                {
                    // An opening parenthesis discards whatever was buffered and starts a new component.
                    sb = new StringBuilder();
                    sb.Append(ch);
                }
                // NOTE(review): same duplicated-operand observation as for the opening parenthesis.
                else if (ch == ')' || ch == ')')
                {
                    sb.Append(ch);
                    string extra1 = sb.ToString();
                    whole.Append(extra1);
                    ca.extra = extra1;
                    sb       = new StringBuilder();
                }
                // NOTE(review): the two space operands also look identical; one may have been a
                // full-width space originally - confirm.
                else if (CharacterUtil.IsChinesePunctuation(ch) || (ch == ' ' || ch == ' '))
                {
                    // Punctuation or a space ends the address.
                    break;
                }
                else if (ch == '大')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // With a non-empty buffer, this character is only consumed as part of one of the
                    // look-ahead patterns below; if none matches it is not appended to the buffer.
                    if (i + 1 < chars.Length)
                    {
                        char nextchar = chars[i + 1];

                        if (nextchar == '桥' || nextchar == '厦')
                        {
                            string extra1 = sb.ToString() + "大" + nextchar;
                            whole.Append(extra1);
                            if (nextchar == '桥')
                            {
                                ca.extra += extra1;
                            }
                            else
                            {
                                ca.building = extra1;
                            }
                            // Skip the look-ahead character (the loop's own i++ accounts for the rest).
                            i += 2 - 1;
                            sb = new StringBuilder();
                        }
                        else if (i + 2 < chars.Length && nextchar == '酒')
                        {
                            char nextchar2 = chars[i + 2];

                            if (nextchar2 == '店')
                            {
                                string extra1 = sb.ToString() + "大" + nextchar + nextchar2;
                                string city   = GetMaximumMatch(extra1, 0, 5);  // city or province
                                if (city != null)
                                {
                                    // Split a leading match off as the city; the rest is the building.
                                    ca.city = city;
                                    whole.Append(ca.city);

                                    extra1 = extra1.Substring(ca.city.Length);
                                }
                                whole.Append(extra1);
                                ca.building = extra1;
                                // Skip the two look-ahead characters.
                                i          += 3 - 1;
                                sb          = new StringBuilder();
                            }
                        }
                    }
                }
                else if (ch == '餐')
                {
                    if (sb.Length == 0)
                    {
                        sb.Append(ch);
                        continue;
                    }
                    // As above: with a non-empty buffer the character is only consumed when the
                    // look-ahead matches; otherwise it is not appended to the buffer.
                    if (i + 1 < chars.Length)
                    {
                        char nextchar = chars[i + 1];
                        if (nextchar == '厅')
                        {
                            string extra1 = sb.ToString() + "餐" + nextchar;
                            whole.Append(extra1);
                            ca.extra += extra1;
                            i        += 2 - 1;
                            sb        = new StringBuilder();
                        }
                    }
                }
                else
                {
                    //if (sb.Length == 0)
                    //    lastStartPos = i;
                    // Ordinary character: buffer it, then check whether the buffer now ends in one of
                    // the two building suffixes tested below.
                    sb.Append(ch);
                    string extra = sb.ToString();
                    if (extra.EndsWith("中心") || extra.EndsWith("酒店"))
                    {
                        string city = GetMaximumMatch(extra, 0, 5); // city
                        if (city != null)
                        {
                            // Note: unlike the other branches, the city is not appended to whole here.
                            ca.city = city;
                            extra   = extra.Substring(city.Length);
                        }
                        ca.building = extra;
                        whole.Append(extra);
                        if (i + 2 < chars.Length && chars[i + 1] == '大' && chars[i + 2] == '厦')  // handle the "中心大厦" combination
                        {
                            ca.building += "大厦";
                            whole.Append("大厦");
                            // Skip the two characters just consumed by the look-ahead.
                            i += 2;
                            sb = new StringBuilder();
                            continue;
                        }
                        sb = new StringBuilder();
                    }
                }
            }
            // A result is only produced when at least one component was recognized.
            if (whole.Length > 0)
            {
                if (sb.Length > 0)
                {
                    // Left-over buffered text replaces any earlier ca.extra value; it is not appended to whole.
                    ca.extra = sb.ToString();
                }
                prc.Add(ParseResult.Create(whole.ToString(), startIndex, POSType.D_S, ca));
            }
            return(prc);
        }