/// <summary>
/// Heuristically decides whether a piece of cell text looks like a person's name.
/// </summary>
/// <remarks>
/// The rules are applied in this order:
/// <list type="number">
/// <item>text that contains a dot is accepted (presumably because initials are written with dots);</item>
/// <item>text without any inner whitespace is rejected;</item>
/// <item>text whose last word passes <c>TextHelpers.CanBePatronymic</c> is accepted;</item>
/// <item>text consisting of exactly three words is accepted;</item>
/// <item>otherwise the answer is whatever <c>HeaderHelpers.IsNameDeclarationField</c> says about
/// the field predicted by <c>ColumnByDataPredictor.PredictByString</c>.</item>
/// </list>
/// </remarks>
/// <param name="s">Text to test. <c>null</c> is treated as "not a name".</param>
/// <returns><c>true</c> when the text is accepted as a person name, otherwise <c>false</c>.</returns>
public static bool CheckPersonName(String s)
{
    if (s == null)
    {
        return false;
    }

    if (s.Contains('.'))
    {
        return true;
    }

    // Trim once and use the trimmed text for BOTH the whitespace test and the split.
    // Splitting the untrimmed text yields empty first/last entries when the value has
    // leading or trailing whitespace, which breaks the "last word" and "three words" rules.
    string trimmed = s.Trim();
    if (!trimmed.Any(Char.IsWhiteSpace))
    {
        return false;
    }

    // Regex.Split always returns at least one element, so indexing the last word is safe.
    string[] words = Regex.Split(trimmed, @"[\,\s\n]+");
    if (TextHelpers.CanBePatronymic(words[words.Length - 1]))
    {
        return true;
    }

    if (words.Length == 3)
    {
        return true;
    }

    // No structural rule matched: fall back to the data-driven column predictor.
    var predictedField = ColumnByDataPredictor.PredictByString(s);
    return HeaderHelpers.IsNameDeclarationField(predictedField);
}
/// <summary>
/// Splits the combined "name and occupation (or relative type)" cell of the current row.
/// </summary>
/// <remarks>
/// Stores the document position of the cell in <c>NameDocPosition</c>. An empty value is ignored,
/// a value recognised by <c>DataHelper.IsRelativeInfo</c> is passed to <c>SetRelative</c>, and
/// anything else is divided into <c>PersonName</c> and <c>Occupation</c> by the first matching rule below.
/// </remarks>
/// <exception cref="SmartParserException">None of the splitting rules matched the cell text.</exception>
void DivideNameAndOccupation()
{
    var nameCell = GetDeclarationField(DeclarationField.NameAndOccupationOrRelativeType);
    NameDocPosition = adapter.GetDocumentPosition(GetRowIndex(), nameCell.Col);

    string cellText = nameCell.GetText(true);
    if (DataHelper.IsEmptyValue(cellText))
    {
        return;
    }

    if (DataHelper.IsRelativeInfo(cellText))
    {
        SetRelative(cellText);
        return;
    }

    // whitespace + any Unicode dash (UnicodeCategory.DashPunctuation) + whitespace
    const string dashSeparator = @"\s+\p{Pd}\s+";

    // Remove every "digits, dot, whitespace" sequence (presumably row numbering such as "1. ").
    cellText = Regex.Replace(cellText, @"\d+\.\s+", "");

    string[] dashParts = Regex.Split(cellText, dashSeparator);
    string withoutDashes = Regex.Replace(cellText, dashSeparator, " ");
    string[] words = Regex.Split(withoutDashes, @"[\,\s\n]+");

    // Rule 1: the name comes first,
    // e.g. "Рутенберг Дмитрий Анатольевич начальник управления" (full name, then job title).
    if (words.Length >= 3
        && TextHelpers.CanBePatronymic(words[2])
        && !TextHelpers.MayContainsRole(words[0])
        && !TextHelpers.MayContainsRole(words[1]))
    {
        PersonName = String.Join(" ", words.Take(3)).Trim();
        Occupation = String.Join(" ", words.Skip(3)).Trim();
        return;
    }

    // Rule 2: the name comes last,
    // e.g. "начальник управления Рутенберг Дмитрий Анатольевич" (job title, then full name).
    if (TextHelpers.CanBePatronymic(words[words.Length - 1]))
    {
        int nameStart = words.Length - 3;
        PersonName = String.Join(" ", words.Skip(nameStart)).Trim();
        Occupation = String.Join(" ", words.Take(nameStart)).Trim();
        return;
    }

    // Rule 3: surname plus initials followed by a role,
    // e.g. "Головачева Н.В., заместитель" (surname, initials, job title).
    if (words.Length >= 2
        && TextHelpers.CanBeInitials(words[1])
        && TextHelpers.MayContainsRole(String.Join(" ", words.Skip(2)).Trim()))
    {
        PersonName = String.Join(" ", words.Take(2)).Trim();
        Occupation = String.Join(" ", words.Skip(2)).Trim();
        return;
    }

    // Rule 4: exactly one dash separator - name on the left, occupation on the right.
    if (dashParts.Length == 2)
    {
        PersonName = dashParts[0].Trim();
        Occupation = dashParts[1].Trim();
        return;
    }

    throw new SmartParserException(
        string.Format("Cannot parse name+occupation value {0} at row {1}", cellText, GetRowIndex()));
}
/// <summary>
/// Guesses the declaration field of a column whose header cell has no title by sampling
/// the data cells located below the header.
/// </summary>
/// <remarks>
/// Up to ten rows are visited, starting right below the (possibly merged) header cell.
/// Rows reported by <c>adapter.IsSectionRow</c> or <c>IAdapter.IsNumbersRow</c> are skipped.
/// Non-empty texts found in the header's column are collected, and the method counts how many
/// of them parse as integers.
/// </remarks>
/// <param name="adapter">Table adapter used to read rows and cells.</param>
/// <param name="headerCell">The untitled header cell; its <c>Row</c>, <c>MergedRowsCount</c>,
/// <c>Col</c> and <c>TextAbove</c> members are read.</param>
/// <returns>The predicted field; the prediction is also written to the debug log.</returns>
public static DeclarationField PredictEmptyColumnTitle(IAdapter adapter, Cell headerCell)
{
    const int maxRowToCollect = 10;

    List<string> texts = new List<string>();
    int numbers = 0;
    int not_numbers = 0;
    int rowIndex = headerCell.Row + headerCell.MergedRowsCount;

    // The row bound is part of the loop condition so that it is also enforced before the
    // first iteration: when the header occupies the last rows of the table there is nothing
    // to sample, and no row past the end of the table must be read.
    for (int i = 0; i < maxRowToCollect && rowIndex < adapter.GetRowsCount(); i++)
    {
        var cells = adapter.GetDataCells(rowIndex, IAdapter.MaxColumnsCount);
        string dummy;
        if (adapter.IsSectionRow(rowIndex, cells, adapter.GetColsCount(), false, out dummy)
            || IAdapter.IsNumbersRow(cells))
        {
            // Section titles and column-numbering rows carry no column data.
            rowIndex += 1;
            continue;
        }

        var c = adapter.GetCell(rowIndex, headerCell.Col);
        if (c == null)
        {
            rowIndex += 1;
            continue;
        }

        var txt = c.GetText(true);
        if (txt.Length > 0)
        {
            texts.Add(txt);
            int d;
            if (int.TryParse(txt, out d))
            {
                numbers += 1;
            }
            else
            {
                not_numbers += 1;
            }
        }

        // Jump over the whole merged cell.
        // NOTE(review): assumes MergedRowsCount >= 1; with 0 the same row would be sampled again - confirm.
        rowIndex += c.MergedRowsCount;
    }

    DeclarationField field;
    if (texts.Count == 1 && headerCell.Col == 0 && TextHelpers.CanBePatronymic(texts[0]))
    {
        // A single sample is not enough data for the generic predictor.
        field = DeclarationField.NameOrRelativeType;
    }
    else if (headerCell.Col == 0 && numbers > not_numbers)
    {
        field = DeclarationField.DeclarantIndex;
    }
    else
    {
        field = PredictByStrings(texts);
        if (field == DeclarationField.NameOrRelativeType && String.Join(" ", texts).Contains(","))
        {
            field = DeclarationField.NameAndOccupationOrRelativeType;
        }
    }

    if (headerCell.TextAbove != null && ((field & DeclarationField.AllOwnTypes) > 0))
    {
        // The ownership flags are taken from the text above the header:
        // clear the predicted ones, then set the one that text indicates (if any).
        string h = headerCell.TextAbove;
        field &= ~DeclarationField.AllOwnTypes;
        if (HeaderHelpers.IsMixedColumn(h))
        {
            field |= DeclarationField.Mixed;
        }
        else if (HeaderHelpers.IsStateColumn(h))
        {
            field |= DeclarationField.State;
        }
        else if (HeaderHelpers.IsOwnedColumn(h))
        {
            field |= DeclarationField.Owned;
        }
    }

    if (field == DeclarationField.NameOrRelativeType
        && TextHelpers.MayContainsRole(String.Join(" ", texts)))
    {
        field = DeclarationField.NameAndOccupationOrRelativeType;
    }

    Logger.Debug(string.Format("predict by {0} -> {1}", String.Join("\\n", texts), field));
    return field;
}
/// <summary>
/// Decides whether <paramref name="row2"/> looks like a continuation of <paramref name="row1"/>,
/// i.e. whether the two rows should be joined.
/// </summary>
/// <remarks>
/// The rows are compared cell by cell. For each pair of cells that both tokenize to at least one
/// token (and whose boundary tokens are non-empty), several independent rules are tried. The rows
/// are joined as soon as one cell pair matches any rule; the rule name is written to the debug log.
/// </remarks>
/// <param name="row1">Cell texts of the upper row.</param>
/// <param name="row2">Cell texts of the lower row.</param>
/// <returns><c>true</c> when some cell pair matches a join rule; <c>false</c> when the rows have
/// different cell counts or no cell pair matches.</returns>
public static bool CheckMergeRow(List<string> row1, List<string> row2)
{
    if (row1.Count != row2.Count)
    {
        return false;
    }

    for (int i = 0; i < row1.Count; ++i)
    {
        var tokens1 = TokenizeCellText(row1[i]);
        var tokens2 = TokenizeCellText(row2[i]);
        if (tokens1.Count == 0 || tokens2.Count == 0)
        {
            continue;
        }

        string lastWord = tokens1.Last();
        string firstWord = tokens2.First();
        if (lastWord.Length == 0 || firstWord.Length == 0)
        {
            continue;
        }

        string text1 = string.Join(" ", tokens1);
        string text2 = string.Join(" ", tokens2);
        string lastWord2 = tokens2[tokens2.Count - 1];

        // Every rule is checked; a later match overrides an earlier one,
        // so the log reports the last rule that matched.
        string joinExplanation = "";

        if (Bigrams.ContainsKey(lastWord + " " + firstWord))
        {
            joinExplanation = "frequent bigram";
        }

        // The upper cell ends with a dash and the lower cell continues in lower case.
        if (Regex.IsMatch(lastWord, @".+\p{Pd}$") && Char.IsLower(firstWord[0]))
        {
            joinExplanation = "word break regexp";
        }

        // Three tokens in total, ending with a patronymic written without dots.
        // Only the LAST token of tokens1 is known to be non-empty, so the first one
        // is length-checked before its first character is inspected.
        if (tokens1.Count + tokens2.Count == 3
            && TextHelpers.CanBePatronymic(lastWord2)
            && !lastWord2.Contains('.')
            && tokens1[0].Length > 0
            && Char.IsUpper(tokens1[0][0]))
        {
            joinExplanation = "person regexp";
        }

        // The upper cell holds a role, the lower cell a capitalised text ending with a patronymic.
        if (TextHelpers.MayContainsRole(text1)
            && TextHelpers.CanBePatronymic(lastWord2)
            && Char.IsUpper(firstWord[0])
            && tokens1.All(x => !TextHelpers.CanBePatronymic(x)))
        {
            joinExplanation = "role and person regexp";
        }

        // A parenthesis is opened in the upper cell and closed in the lower one.
        if (Regex.Match(text1, @".+\([^\)]+", RegexOptions.Singleline).Success
            && Regex.Match(text2, @"^[^\(]+\).*", RegexOptions.Singleline).Success)
        {
            joinExplanation = "non-closed ) regexp";
        }

        // The lower cell starts with "(". A whitespace-only token trims to an empty
        // string, so the length is checked before indexing.
        string firstWordTrimmed = firstWord.Trim();
        if (firstWordTrimmed.Length > 0 && firstWordTrimmed[0] == '(')
        {
            joinExplanation = "open ( regexp";
        }

        if (joinExplanation != "")
        {
            Logger.Debug(string.Format(
                "Join rows using {0} on cells \"{1}\" and \"{2}\"",
                joinExplanation,
                row1[i].ReplaceEolnWithSpace(),
                row2[i].ReplaceEolnWithSpace()));
            return true;
        }
    }

    return false;
}
/// <summary>
/// Splits the text of a combined "name and occupation (or relative type)" cell.
/// </summary>
/// <remarks>
/// Stores the document position of the cell in <c>NameDocPosition</c>. An empty value is ignored,
/// a value recognised by <c>DataHelper.IsRelativeInfo</c> is passed to <c>SetRelative</c>, and
/// anything else is divided into <c>PersonName</c> and <c>Occupation</c> by the first matching rule below.
/// </remarks>
/// <param name="nameCell">The cell holding the combined value.</param>
/// <returns><c>false</c> when no splitting rule matched (an error is logged);
/// <c>true</c> in every other case, including an empty value.</returns>
bool DivideNameAndOccupation(Cell nameCell)
{
    NameDocPosition = adapter.GetDocumentPosition(GetRowIndex(), nameCell.Col);

    string cellText = nameCell.GetText(true);
    if (DataHelper.IsEmptyValue(cellText))
    {
        return true;
    }

    if (DataHelper.IsRelativeInfo(cellText))
    {
        SetRelative(cellText);
        return true;
    }

    // whitespace + any Unicode dash (UnicodeCategory.DashPunctuation) + whitespace
    const string dashSeparator = @"\s+\p{Pd}\s+";

    // Remove every "digits, dot, whitespace" sequence (presumably row numbering such as "1. ").
    cellText = Regex.Replace(cellText, @"\d+\.\s+", "");

    string[] dashParts = Regex.Split(cellText, dashSeparator);
    string withoutDashes = Regex.Replace(cellText, dashSeparator, " ");
    string[] words = Regex.Split(withoutDashes, @"[\,\s\n]+");

    // Rule 1: the name comes first,
    // e.g. "Рутенберг Дмитрий Анатольевич начальник управления" (full name, then job title).
    if (words.Length >= 3
        && TextHelpers.CanBePatronymic(words[2])
        && !TextHelpers.MayContainsRole(words[0])
        && !TextHelpers.MayContainsRole(words[1]))
    {
        PersonName = String.Join(" ", words.Take(3)).Trim();
        Occupation = String.Join(" ", words.Skip(3)).Trim();
        return true;
    }

    // Rule 2: the name comes last,
    // e.g. "начальник управления Рутенберг Дмитрий Анатольевич" (job title, then full name).
    if (TextHelpers.CanBePatronymic(words[words.Length - 1]))
    {
        int nameStart = words.Length - 3;
        PersonName = String.Join(" ", words.Skip(nameStart)).Trim();
        Occupation = String.Join(" ", words.Take(nameStart)).Trim();
        return true;
    }

    // Rule 3: two dotted initials directly followed by a comma end the name,
    // e.g. "Головачева Н.В., заместитель" (surname, initials, comma, job title).
    // The pattern is matched once and the result reused for both substrings.
    Match initialsMatch = Regex.Match(cellText, @"\w\.\w\.,");
    if (initialsMatch.Success)
    {
        int commaIndex = initialsMatch.Index + initialsMatch.Length - 1;
        PersonName = cellText.Substring(0, commaIndex).Trim();
        Occupation = cellText.Substring(commaIndex + 1).Trim();
        return true;
    }

    // Rule 4: surname plus initials followed by a role, without relying on the comma.
    if (words.Length >= 2
        && TextHelpers.CanBeInitials(words[1])
        && TextHelpers.MayContainsRole(String.Join(" ", words.Skip(2)).Trim()))
    {
        PersonName = String.Join(" ", words.Take(2)).Trim();
        Occupation = String.Join(" ", words.Skip(2)).Trim();
        return true;
    }

    // Rule 5: exactly one dash separator - name on the left, occupation on the right.
    if (dashParts.Length == 2)
    {
        PersonName = dashParts[0].Trim();
        Occupation = dashParts[1].Trim();
        return true;
    }

    // Maybe the PDF has split cells (a table continued on another page);
    // example file: "5966/14 Upravlenie delami.pdf" converted to docx.
    Logger.Error(string.Format("Cannot parse name+occupation value {0} at row {1}", cellText, GetRowIndex()));
    return false;
}