/// <summary>
/// Reads the "name and occupation or relative type" cell of the current row and
/// splits its text into <c>PersonName</c> and <c>Occupation</c>, or hands the text
/// to <c>SetRelative</c> when it describes a relative.
/// Also records the document position of that cell in <c>NameDocPosition</c>.
/// An empty cell value leaves <c>PersonName</c> and <c>Occupation</c> untouched.
/// </summary>
/// <exception cref="SmartParserException">
/// Thrown when none of the known "name + occupation" layouts matches the cell text.
/// </exception>
void DivideNameAndOccupation()
{
    var cell = GetDeclarationField(DeclarationField.NameAndOccupationOrRelativeType);
    NameDocPosition = adapter.GetDocumentPosition(GetRowIndex(), cell.Col);

    string text = cell.GetText(true);
    if (DataHelper.IsEmptyValue(text))
    {
        return;
    }

    if (DataHelper.IsRelativeInfo(text))
    {
        SetRelative(text);
        return;
    }

    // A dash surrounded by whitespace; \p{Pd} is UnicodeCategory.DashPunctuation.
    const string dashSeparator = @"\s+\p{Pd}\s+";

    // Drop every "digits, dot, whitespace" run (presumably row numbering such as "1. ").
    text = Regex.Replace(text, @"\d+\.\s+", "");

    // Words of the text with the dash separators replaced by a single space.
    string withoutDashes = Regex.Replace(text, dashSeparator, " ");
    string[] words = Regex.Split(withoutDashes, @"[\,\s\n]+");

    // ex: "Рутенберг Дмитрий Анатольевич начальник управления"
    bool startsWithFullName = words.Length >= 3
        && TextHelpers.CanBePatronymic(words[2])
        && !TextHelpers.MayContainsRole(words[0])
        && !TextHelpers.MayContainsRole(words[1]);
    if (startsWithFullName)
    {
        PersonName = String.Join(" ", words.Take(3)).Trim();
        Occupation = String.Join(" ", words.Skip(3)).Trim();
        return;
    }

    // ex: "начальник управления Рутенберг Дмитрий Анатольевич"
    if (TextHelpers.CanBePatronymic(words.Last()))
    {
        // For fewer than three words the count is negative: Skip then yields
        // every word and Take yields none, so the whole text becomes the name.
        int occupationWordCount = words.Length - 3;
        PersonName = String.Join(" ", words.Skip(occupationWordCount)).Trim();
        Occupation = String.Join(" ", words.Take(occupationWordCount)).Trim();
        return;
    }

    // ex: "Головачева Н.В., заместитель"
    if (words.Length >= 2
        && TextHelpers.CanBeInitials(words[1])
        && TextHelpers.MayContainsRole(String.Join(" ", words.Skip(2)).Trim()))
    {
        PersonName = String.Join(" ", words.Take(2)).Trim();
        Occupation = String.Join(" ", words.Skip(2)).Trim();
        return;
    }

    // Last resort: exactly one dash separator, name on the left, occupation on the right.
    string[] dashParts = Regex.Split(text, dashSeparator);
    if (dashParts.Length == 2)
    {
        PersonName = dashParts[0].Trim();
        Occupation = dashParts[1].Trim();
        return;
    }

    throw new SmartParserException(
        string.Format("Cannot parse name+occupation value {0} at row {1}", text, GetRowIndex()));
}
/// <summary>
/// Splits the text of <paramref name="nameCell"/> into <c>PersonName</c> and
/// <c>Occupation</c>, or hands the text to <c>SetRelative</c> when it describes a relative.
/// Also records the document position of the cell in <c>NameDocPosition</c>.
/// </summary>
/// <param name="nameCell">Cell holding the combined "name and occupation" (or relative type) text.</param>
/// <returns>
/// <c>true</c> when the cell is empty, describes a relative, or was split successfully;
/// <c>false</c> when no known layout matched (the failure is logged, nothing is thrown).
/// </returns>
bool DivideNameAndOccupation(Cell nameCell)
{
    NameDocPosition = adapter.GetDocumentPosition(GetRowIndex(), nameCell.Col);

    string text = nameCell.GetText(true);
    if (DataHelper.IsEmptyValue(text))
    {
        return true;
    }

    if (DataHelper.IsRelativeInfo(text))
    {
        SetRelative(text);
        return true;
    }

    // A dash surrounded by whitespace; \p{Pd} is UnicodeCategory.DashPunctuation.
    const string dashSeparator = @"\s+\p{Pd}\s+";

    // Drop every "digits, dot, whitespace" run (presumably row numbering such as "1. ").
    text = Regex.Replace(text, @"\d+\.\s+", "");

    // Words of the text with the dash separators replaced by a single space.
    string withoutDashes = Regex.Replace(text, dashSeparator, " ");
    string[] words = Regex.Split(withoutDashes, @"[\,\s\n]+");

    // ex: "Рутенберг Дмитрий Анатольевич начальник управления"
    if (words.Length >= 3
        && TextHelpers.CanBePatronymic(words[2])
        && !TextHelpers.MayContainsRole(words[0])
        && !TextHelpers.MayContainsRole(words[1]))
    {
        PersonName = String.Join(" ", words.Take(3)).Trim();
        Occupation = String.Join(" ", words.Skip(3)).Trim();
        return true;
    }

    // ex: "начальник управления Рутенберг Дмитрий Анатольевич"
    if (TextHelpers.CanBePatronymic(words.Last()))
    {
        // For fewer than three words the count is negative: Skip then yields
        // every word and Take yields none, so the whole text becomes the name.
        int occupationWordCount = words.Length - 3;
        PersonName = String.Join(" ", words.Skip(occupationWordCount)).Trim();
        Occupation = String.Join(" ", words.Take(occupationWordCount)).Trim();
        return true;
    }

    // ex: "Головачева Н.В., заместитель"
    // The match is evaluated once and reused for both the test and the split.
    Match initialsWithComma = Regex.Match(text, @"\w\.\w\.,");
    if (initialsWithComma.Success)
    {
        // The comma is the last character of the match: the name is everything
        // before it, the occupation everything after it.
        int commaIndex = initialsWithComma.Index + initialsWithComma.Length - 1;
        PersonName = text.Substring(0, commaIndex).Trim();
        Occupation = text.Substring(commaIndex + 1).Trim();
        return true;
    }

    // Reached only when the text has no "X.X.," pattern: the second word looks
    // like initials and the remaining words may contain a role.
    if (words.Length >= 2
        && TextHelpers.CanBeInitials(words[1])
        && TextHelpers.MayContainsRole(String.Join(" ", words.Skip(2)).Trim()))
    {
        PersonName = String.Join(" ", words.Take(2)).Trim();
        Occupation = String.Join(" ", words.Skip(2)).Trim();
        return true;
    }

    // Last resort: exactly one dash separator, name on the left, occupation on the right.
    // The split is only needed here, so it is not computed for the branches above.
    string[] dashParts = Regex.Split(text, dashSeparator);
    if (dashParts.Length == 2)
    {
        PersonName = dashParts[0].Trim();
        Occupation = dashParts[1].Trim();
        return true;
    }

    // maybe PDF has split cells (table on different pages)
    // example file: "5966/14 Upravlenie delami.pdf" converted to docx
    Logger.Error(string.Format("Cannot parse name+occupation value {0} at row {1}", text, GetRowIndex()));
    return false;
}