コード例 #1
0
        /// <summary>
        /// Reads the combined "name and occupation / relative type" cell of the current row and
        /// either passes its text to <c>SetRelative</c> or splits it into <c>PersonName</c> and
        /// <c>Occupation</c>.
        /// </summary>
        /// <remarks>
        /// The document position of the cell is always stored in <c>NameDocPosition</c>.
        /// When the cell value is empty, nothing else is assigned.
        /// </remarks>
        /// <exception cref="SmartParserException">
        /// Thrown when the text matches none of the supported name/occupation layouts.
        /// </exception>
        void DivideNameAndOccupation()
        {
            var nameCell = GetDeclarationField(DeclarationField.NameAndOccupationOrRelativeType);

            NameDocPosition = adapter.GetDocumentPosition(GetRowIndex(), nameCell.Col);

            string v = nameCell.GetText(true);

            if (DataHelper.IsEmptyValue(v))
            {
                return;
            }
            if (DataHelper.IsRelativeInfo(v))
            {
                SetRelative(v);
                return;
            }

            string dashPattern = @"\s+\p{Pd}\s+"; // UnicodeCategory.DashPunctuation

            // Drop enumeration prefixes such as "1. " before analysing the text.
            v = Regex.Replace(v, @"\d+\.\s+", "");
            string[] two_parts = Regex.Split(v, dashPattern);
            string   clean_v   = Regex.Replace(v, dashPattern, " ");

            // Regex.Split emits an empty string when the input starts or ends with a separator
            // (e.g. a trailing comma). Such empty tokens would shift the positional checks below
            // and make words.Last() empty, so they are filtered out.
            string[] words = Regex.Split(clean_v, @"[\,\s\n]+")
                                  .Where(w => w.Length > 0)
                                  .ToArray();

            if (words.Length >= 3 && TextHelpers.CanBePatronymic(words[2]) &&
                !TextHelpers.MayContainsRole(words[0]) &&
                !TextHelpers.MayContainsRole(words[1]))
            {
                // ex: "Рутенберг Дмитрий Анатольевич начальник управления"
                PersonName = String.Join(" ", words.Take(3)).Trim();
                Occupation = String.Join(" ", words.Skip(3)).Trim();
                return;
            }

            if (words.Length > 0 && TextHelpers.CanBePatronymic(words.Last()))
            {
                // ex: "начальник управления Рутенберг Дмитрий Анатольевич"
                // With fewer than three words Skip yields every word and Take yields none,
                // so the whole text becomes the name and the occupation stays empty.
                PersonName = String.Join(" ", words.Skip(words.Length - 3)).Trim();
                Occupation = String.Join(" ", words.Take(words.Length - 3)).Trim();
                return;
            }

            if (words.Length >= 2 && TextHelpers.CanBeInitials(words[1]) &&
                TextHelpers.MayContainsRole(String.Join(" ", words.Skip(2)).Trim()))
            {
                // ex: "Головачева Н.В., заместитель"
                PersonName = String.Join(" ", words.Take(2)).Trim();
                Occupation = String.Join(" ", words.Skip(2)).Trim();
                return;
            }

            if (two_parts.Length == 2)
            {
                // Exactly one dash separator: name on the left, occupation on the right.
                PersonName = two_parts[0].Trim();
                Occupation = two_parts[1].Trim();
                return;
            }

            throw new SmartParserException(
                      string.Format("Cannot parse name+occupation value {0} at row {1}", v, GetRowIndex()));
        }
コード例 #2
0
ファイル: IAdapterRow.cs プロジェクト: TI-Russia/smart_parser
        /// <summary>
        /// Takes the text of <paramref name="nameCell"/> and either passes it to
        /// <c>SetRelative</c> or splits it into <c>PersonName</c> and <c>Occupation</c>.
        /// </summary>
        /// <remarks>
        /// The document position of the cell is always stored in <c>NameDocPosition</c>.
        /// </remarks>
        /// <param name="nameCell">Cell holding the combined name/occupation (or relative) text.</param>
        /// <returns>
        /// <c>true</c> when the cell is empty, describes a relative, or was split successfully;
        /// <c>false</c> when no supported layout matched (the failure is logged as an error).
        /// </returns>
        bool DivideNameAndOccupation(Cell nameCell)
        {
            NameDocPosition = adapter.GetDocumentPosition(GetRowIndex(), nameCell.Col);

            string v = nameCell.GetText(true);

            if (DataHelper.IsEmptyValue(v))
            {
                return true;
            }
            if (DataHelper.IsRelativeInfo(v))
            {
                SetRelative(v);
                return true;
            }

            string dashPattern = @"\s+\p{Pd}\s+"; // UnicodeCategory.DashPunctuation

            // Drop enumeration prefixes such as "1. " before analysing the text.
            v = Regex.Replace(v, @"\d+\.\s+", "");
            string[] two_parts = Regex.Split(v, dashPattern);
            string   clean_v   = Regex.Replace(v, dashPattern, " ");

            // Regex.Split emits an empty string when the input starts or ends with a separator
            // (e.g. a trailing comma). Such empty tokens would shift the positional checks below
            // and make words.Last() empty, so they are filtered out.
            string[] words = Regex.Split(clean_v, @"[\,\s\n]+")
                                  .Where(w => w.Length > 0)
                                  .ToArray();

            if (words.Length >= 3 && TextHelpers.CanBePatronymic(words[2]) &&
                !TextHelpers.MayContainsRole(words[0]) &&
                !TextHelpers.MayContainsRole(words[1]))
            {
                // ex: "Рутенберг Дмитрий Анатольевич начальник управления"
                PersonName = String.Join(" ", words.Take(3)).Trim();
                Occupation = String.Join(" ", words.Skip(3)).Trim();
                return true;
            }

            if (words.Length > 0 && TextHelpers.CanBePatronymic(words.Last()))
            {
                // ex: "начальник управления Рутенберг Дмитрий Анатольевич"
                // With fewer than three words Skip yields every word and Take yields none,
                // so the whole text becomes the name and the occupation stays empty.
                PersonName = String.Join(" ", words.Skip(words.Length - 3)).Trim();
                Occupation = String.Join(" ", words.Take(words.Length - 3)).Trim();
                return true;
            }

            // Initials immediately followed by a comma; matched once and reused below.
            Match initialsWithComma = Regex.Match(v, @"\w\.\w\.,");
            if (initialsWithComma.Success)
            {
                // ex: "Головачева Н.В., заместитель"
                int commaIndex = initialsWithComma.Index + initialsWithComma.Length - 1;
                PersonName = v.Substring(0, commaIndex).Trim();   // everything before the comma
                Occupation = v.Substring(commaIndex + 1).Trim();  // everything after the comma
                return true;
            }

            if (words.Length >= 2 && TextHelpers.CanBeInitials(words[1]) &&
                TextHelpers.MayContainsRole(String.Join(" ", words.Skip(2)).Trim()))
            {
                // ex: "Головачева Н.В., заместитель"
                PersonName = String.Join(" ", words.Take(2)).Trim();
                Occupation = String.Join(" ", words.Skip(2)).Trim();
                return true;
            }

            if (two_parts.Length == 2)
            {
                // Exactly one dash separator: name on the left, occupation on the right.
                PersonName = two_parts[0].Trim();
                Occupation = two_parts[1].Trim();
                return true;
            }

            // maybe PDF has split cells (table on different pages)
            // example file: "5966/14 Upravlenie delami.pdf" converted to docx
            Logger.Error(string.Format("Cannot parse name+occupation value {0} at row {1}", v, GetRowIndex()));
            return false;
        }