コード例 #1
0
ファイル: IAdapterRow.cs プロジェクト: TI-Russia/smart_parser
        /// <summary>
        /// Heuristically decides whether <paramref name="s"/> looks like a person name.
        /// </summary>
        /// <param name="s">Text to test; must not be null.</param>
        /// <returns>
        /// <c>true</c> when the text contains a dot, when its last word is accepted by
        /// <c>TextHelpers.CanBePatronymic</c>, when it consists of exactly three words, or
        /// when <c>ColumnByDataPredictor.PredictByString</c> yields a field accepted by
        /// <c>HeaderHelpers.IsNameDeclarationField</c>; <c>false</c> otherwise (including
        /// any dot-free text that has no inner whitespace).
        /// </returns>
        public static bool CheckPersonName(String s)
        {
            // NOTE(review): a dot presumably indicates initials (e.g. "Н.В.") - confirm.
            if (s.Contains('.'))
            {
                return true;
            }

            string trimmed = s.Trim();
            if (!trimmed.Any(Char.IsWhiteSpace))
            {
                return false;
            }

            // Split the trimmed text and drop empty tokens: Regex.Split emits "" for a
            // separator at the very start or end (e.g. a trailing comma), which would
            // otherwise be taken as the "last word" and distort the word count.
            string[] words = Regex.Split(trimmed, @"[\,\s\n]+")
                                  .Where(w => w.Length > 0)
                                  .ToArray();

            if (words.Length > 0 && TextHelpers.CanBePatronymic(words[words.Length - 1]))
            {
                return true;
            }
            if (words.Length == 3)
            {
                return true;
            }

            // Any other word count: defer to the data-driven column predictor.
            var predictedField = ColumnByDataPredictor.PredictByString(s);
            return HeaderHelpers.IsNameDeclarationField(predictedField);
        }
コード例 #2
0
        /// <summary>
        /// Reads the NameAndOccupationOrRelativeType cell of the current row and either
        /// registers it as a relative (via <c>SetRelative</c>) or splits its text into
        /// <c>PersonName</c> and <c>Occupation</c>. Also records <c>NameDocPosition</c>.
        /// An empty value leaves name and occupation untouched.
        /// </summary>
        /// <exception cref="SmartParserException">
        /// Thrown when none of the splitting heuristics matches the cell text.
        /// </exception>
        void DivideNameAndOccupation()
        {
            var cell = GetDeclarationField(DeclarationField.NameAndOccupationOrRelativeType);
            NameDocPosition = adapter.GetDocumentPosition(GetRowIndex(), cell.Col);

            string text = cell.GetText(true);
            if (DataHelper.IsEmptyValue(text))
            {
                return;
            }

            if (DataHelper.IsRelativeInfo(text))
            {
                SetRelative(text);
                return;
            }

            // A dash (UnicodeCategory.DashPunctuation) surrounded by whitespace.
            const string dashSeparator = @"\s+\p{Pd}\s+";

            // Remove every "<digits>. " fragment before splitting.
            text = Regex.Replace(text, @"\d+\.\s+", "");

            string[] dashParts  = Regex.Split(text, dashSeparator);
            string   dashless   = Regex.Replace(text, dashSeparator, " ");
            string[] tokens     = Regex.Split(dashless, @"[\,\s\n]+");
            int      tailOffset = tokens.Length - 3;

            if (tokens.Length >= 3 &&
                TextHelpers.CanBePatronymic(tokens[2]) &&
                !TextHelpers.MayContainsRole(tokens[0]) &&
                !TextHelpers.MayContainsRole(tokens[1]))
            {
                // Name first, e.g. "Рутенберг Дмитрий Анатольевич начальник управления"
                PersonName = String.Join(" ", tokens.Take(3)).Trim();
                Occupation = String.Join(" ", tokens.Skip(3)).Trim();
                return;
            }

            if (TextHelpers.CanBePatronymic(tokens[tokens.Length - 1]))
            {
                // Name last, e.g. "начальник управления Рутенберг Дмитрий Анатольевич"
                PersonName = String.Join(" ", tokens.Skip(tailOffset)).Trim();
                Occupation = String.Join(" ", tokens.Take(tailOffset)).Trim();
                return;
            }

            if (tokens.Length >= 2 &&
                TextHelpers.CanBeInitials(tokens[1]) &&
                TextHelpers.MayContainsRole(String.Join(" ", tokens.Skip(2)).Trim()))
            {
                // Surname plus initials, e.g. "Головачева Н.В., заместитель"
                PersonName = String.Join(" ", tokens.Take(2)).Trim();
                Occupation = String.Join(" ", tokens.Skip(2)).Trim();
                return;
            }

            if (dashParts.Length == 2)
            {
                // Exactly one dash separator: left part is the name, right the occupation.
                PersonName = dashParts[0].Trim();
                Occupation = dashParts[1].Trim();
                return;
            }

            throw new SmartParserException(
                      string.Format("Cannot parse name+occupation value {0} at row {1}", text, GetRowIndex()));
        }
コード例 #3
0
        /// <summary>
        /// Predicts the declaration field of a column whose header cell has no usable
        /// title, by sampling the texts found below the header in the same column.
        /// </summary>
        /// <param name="adapter">Table adapter used to read rows and cells.</param>
        /// <param name="headerCell">
        /// The header cell; sampling starts at <c>headerCell.Row + headerCell.MergedRowsCount</c>
        /// in column <c>headerCell.Col</c>.
        /// </param>
        /// <returns>
        /// The predicted field; <c>DeclarationField.None</c> is the initial value and the
        /// final value is whatever the heuristics below produce.
        /// </returns>
        public static DeclarationField PredictEmptyColumnTitle(IAdapter adapter, Cell headerCell)
        {
            List <string> texts           = new List <string>();
            int           rowIndex        = headerCell.Row + headerCell.MergedRowsCount;
            const int     maxRowToCollect = 10;
            int           numbers         = 0;
            int           not_numbers     = 0;

            // The row bound is checked BEFORE each iteration so that a header sitting on
            // the last row of the table never causes a read past the end of the table.
            for (int i = 0; i < maxRowToCollect && rowIndex < adapter.GetRowsCount(); i++)
            {
                var    cells = adapter.GetDataCells(rowIndex, IAdapter.MaxColumnsCount);
                string dummy;
                if (adapter.IsSectionRow(rowIndex, cells, adapter.GetColsCount(), false, out dummy) ||
                    IAdapter.IsNumbersRow(cells))
                {
                    // Section rows and numbering rows carry no sample data for the column.
                    rowIndex += 1;
                    continue;
                }

                var c = adapter.GetCell(rowIndex, headerCell.Col);
                if (c == null)
                {
                    rowIndex += 1;
                    continue;
                }

                var txt = c.GetText(true);
                if (txt.Length > 0)
                {
                    texts.Add(txt);
                    int d;
                    if (int.TryParse(txt, out d))
                    {
                        numbers += 1;
                    }
                    else
                    {
                        not_numbers += 1;
                    }
                }
                // Advance by the cell's merged-row count.
                rowIndex += c.MergedRowsCount;
            }

            var field = DeclarationField.None;

            if (texts.Count == 1 && headerCell.Col == 0 && TextHelpers.CanBePatronymic(texts[0]))
            {
                // A single sample is too little for PredictByStrings; rely on the patronymic check.
                field = DeclarationField.NameOrRelativeType;
            }
            else if (headerCell.Col == 0 && numbers > not_numbers)
            {
                // First column filled mostly with integers.
                field = DeclarationField.DeclarantIndex;
            }
            else
            {
                field = PredictByStrings(texts);
                if (field == DeclarationField.NameOrRelativeType && String.Join(" ", texts).Contains(","))
                {
                    field = DeclarationField.NameAndOccupationOrRelativeType;
                }
            }

            if (headerCell.TextAbove != null && ((field & DeclarationField.AllOwnTypes) > 0))
            {
                string h = headerCell.TextAbove;
                // Clear the AllOwnTypes bits and re-derive them from the text above the header;
                // if no helper matches, the bits stay cleared.
                field &= ~DeclarationField.AllOwnTypes;
                if (HeaderHelpers.IsMixedColumn(h))
                {
                    field |= DeclarationField.Mixed;
                }
                else if (HeaderHelpers.IsStateColumn(h))
                {
                    field |= DeclarationField.State;
                }
                else if (HeaderHelpers.IsOwnedColumn(h))
                {
                    field |= DeclarationField.Owned;
                }
            }

            if (field == DeclarationField.NameOrRelativeType &&
                TextHelpers.MayContainsRole(String.Join(" ", texts)))
            {
                field = DeclarationField.NameAndOccupationOrRelativeType;
            }

            // Samples are joined with a literal backslash-n so the log entry stays on one line.
            Logger.Debug(string.Format("predict by {0}  -> {1}",
                                       String.Join("\\n", texts), field));
            return field;
        }
コード例 #4
0
        /// <summary>
        /// Checks, column by column, whether the text of <paramref name="row2"/> looks like
        /// a continuation of the text of <paramref name="row1"/>, so the two rows can be joined.
        /// </summary>
        /// <param name="row1">Cell texts of the upper row.</param>
        /// <param name="row2">Cell texts of the lower row.</param>
        /// <returns>
        /// <c>true</c> as soon as one column pair matches any of the join heuristics;
        /// <c>false</c> when the rows have different cell counts or no column matches.
        /// </returns>
        public static bool CheckMergeRow(List <string> row1, List <string> row2)
        {
            if (row1.Count != row2.Count)
            {
                return false;
            }
            for (int i = 0; i < row1.Count; ++i)
            {
                var tokens1 = TokenizeCellText(row1[i]);
                var tokens2 = TokenizeCellText(row2[i]);
                if (tokens1.Count == 0 || tokens2.Count == 0)
                {
                    continue;
                }

                string lastWord  = tokens1.Last();
                string firstWord = tokens2.First();
                if (lastWord.Length == 0 || firstWord.Length == 0)
                {
                    continue;
                }

                // All heuristics are evaluated in order; the last one that fires
                // determines the explanation written to the log.
                string joinExplanation = "";
                if (Bigrams.ContainsKey(lastWord + " " + firstWord))
                {
                    joinExplanation = "frequent bigram";
                }

                // Upper cell ends with a dash and the lower cell starts in lower case.
                if (Regex.IsMatch(lastWord, @".+\p{Pd}$") &&
                    Char.IsLower(firstWord[0])
                    )
                {
                    joinExplanation = "word break regexp";
                }

                // Three tokens in total, ending with a dot-free patronymic.
                // tokens1[0] is length-checked here because only the LAST token of
                // tokens1 was validated above; an empty first token would throw.
                if (tokens1.Count + tokens2.Count == 3 &&
                    TextHelpers.CanBePatronymic(tokens2[tokens2.Count - 1]) &&
                    !tokens2[tokens2.Count - 1].Contains('.') &&
                    tokens1[0].Length > 0 &&
                    Char.IsUpper(tokens1[0][0])
                    )
                {
                    joinExplanation = "person regexp";
                }

                // Upper cell holds a role, lower cell holds a capitalised name ending
                // with a patronymic. tokens2[0] is firstWord, already known to be non-empty.
                if (TextHelpers.MayContainsRole(string.Join(" ", tokens1)) &&
                    TextHelpers.CanBePatronymic(tokens2.Last()) &&
                    Char.IsUpper(tokens2[0][0]) &&
                    tokens1.All(x => !TextHelpers.CanBePatronymic(x))
                    )
                {
                    joinExplanation = "role and person regexp";
                }

                // Upper cell contains "(" followed by non-")" characters, and the lower
                // cell contains ")" before any "(".
                if (Regex.Match(string.Join(" ", tokens1), @".+\([^\)]+", RegexOptions.Singleline).Success &&
                    Regex.Match(string.Join(" ", tokens2), @"^[^\(]+\).*", RegexOptions.Singleline).Success)
                {
                    joinExplanation = "non-closed ) regexp";
                }

                // Lower cell starts with "(". TrimStart + StartsWith replaces Trim()[0],
                // which threw on a token consisting only of whitespace.
                if (firstWord.TrimStart().StartsWith("(", StringComparison.Ordinal))
                {
                    joinExplanation = "open ( regexp";
                }

                if (joinExplanation != "")
                {
                    Logger.Debug(string.Format(
                                     "Join rows using {0} on cells \"{1}\" and \"{2}\"",
                                     joinExplanation,
                                     row1[i].ReplaceEolnWithSpace(),
                                     row2[i].ReplaceEolnWithSpace()));
                    return true;
                }
            }
            return false;
        }
コード例 #5
0
ファイル: IAdapterRow.cs プロジェクト: TI-Russia/smart_parser
        /// <summary>
        /// Interprets the text of <paramref name="nameCell"/>: either registers it as a
        /// relative (via <c>SetRelative</c>) or splits it into <c>PersonName</c> and
        /// <c>Occupation</c>. Also records <c>NameDocPosition</c>.
        /// </summary>
        /// <param name="nameCell">Cell holding the combined name/occupation text.</param>
        /// <returns>
        /// <c>false</c> when no splitting heuristic matches (an error is logged);
        /// <c>true</c> otherwise, including for empty values and relatives.
        /// </returns>
        bool DivideNameAndOccupation(Cell nameCell)
        {
            NameDocPosition = adapter.GetDocumentPosition(GetRowIndex(), nameCell.Col);

            string text = nameCell.GetText(true);
            if (DataHelper.IsEmptyValue(text))
            {
                return true;
            }

            if (DataHelper.IsRelativeInfo(text))
            {
                SetRelative(text);
                return true;
            }

            // A dash (UnicodeCategory.DashPunctuation) surrounded by whitespace.
            const string dashSeparator = @"\s+\p{Pd}\s+";

            // Remove every "<digits>. " fragment before splitting.
            text = Regex.Replace(text, @"\d+\.\s+", "");

            string[] dashParts  = Regex.Split(text, dashSeparator);
            string   dashless   = Regex.Replace(text, dashSeparator, " ");
            string[] tokens     = Regex.Split(dashless, @"[\,\s\n]+");
            int      tailOffset = tokens.Length - 3;

            if (tokens.Length >= 3 &&
                TextHelpers.CanBePatronymic(tokens[2]) &&
                !TextHelpers.MayContainsRole(tokens[0]) &&
                !TextHelpers.MayContainsRole(tokens[1]))
            {
                // Name first, e.g. "Рутенберг Дмитрий Анатольевич начальник управления"
                PersonName = String.Join(" ", tokens.Take(3)).Trim();
                Occupation = String.Join(" ", tokens.Skip(3)).Trim();
                return true;
            }

            if (TextHelpers.CanBePatronymic(tokens[tokens.Length - 1]))
            {
                // Name last, e.g. "начальник управления Рутенберг Дмитрий Анатольевич"
                PersonName = String.Join(" ", tokens.Skip(tailOffset)).Trim();
                Occupation = String.Join(" ", tokens.Take(tailOffset)).Trim();
                return true;
            }

            // Two dotted initials directly followed by a comma,
            // e.g. "Головачева Н.В., заместитель"
            Match initials = Regex.Match(text, @"\w\.\w\.,");
            if (initials.Success)
            {
                int commaIndex = initials.Index + initials.Length - 1;
                PersonName = text.Substring(0, commaIndex).Trim();
                Occupation = text.Substring(commaIndex + 1).Trim();
                return true;
            }

            if (tokens.Length >= 2 &&
                TextHelpers.CanBeInitials(tokens[1]) &&
                TextHelpers.MayContainsRole(String.Join(" ", tokens.Skip(2)).Trim()))
            {
                // Second word is initials and the remainder may contain a role.
                PersonName = String.Join(" ", tokens.Take(2)).Trim();
                Occupation = String.Join(" ", tokens.Skip(2)).Trim();
                return true;
            }

            if (dashParts.Length == 2)
            {
                // Exactly one dash separator: left part is the name, right the occupation.
                PersonName = dashParts[0].Trim();
                Occupation = dashParts[1].Trim();
                return true;
            }

            // maybe PDF has split cells (table on different pages)
            // example file: "5966/14 Upravlenie delami.pdf" converted to docx
            Logger.Error(string.Format("Cannot parse name+occupation value {0} at row {1}", text, GetRowIndex()));
            return false;
        }