コード例 #1
0
        /// <summary>
        /// Predicts a single declaration field for a group of cell values: every
        /// non-empty value votes for one field and the tally is handed to <c>FindMin</c>.
        /// </summary>
        /// <param name="words">Cell values to classify; empty values are ignored.</param>
        /// <returns>
        /// The field selected by <c>FindMin</c> from the negated vote counts
        /// (presumably the field with the most votes - confirm against FindMin).
        /// </returns>
        public static DeclarationField PredictByStrings(List <string> words)
        {
            // Votes are counted downwards (-1 per vote) because the winner is picked by FindMin.
            var votes = new Dictionary <DeclarationField, double>();

            foreach (string word in words)
            {
                if (DataHelper.IsEmptyValue(word))
                {
                    continue;
                }

                // First try to read the value as a header title, then fall back
                // to the per-string predictor.
                var guess = HeaderHelpers.TryGetField("", word);
                if (guess == DeclarationField.None)
                {
                    guess = PredictByString(word);
                }

                // TryGetValue leaves the tally at 0 for a field that has no votes yet.
                double tally;
                votes.TryGetValue(guess, out tally);
                votes[guess] = tally - 1;
            }
            return(FindMin(votes));
        }
コード例 #2
0
ファイル: ReadHeader.cs プロジェクト: TI-Russia/smart_parser
        /// <summary>
        /// Cheap heuristic that inspects at most the first four cells of a row and
        /// reports whether the row looks like a table header.
        /// </summary>
        /// <param name="adapter">Table adapter, used to look up the sub-cells under a cell.</param>
        /// <param name="cells">Cells of the candidate header row.</param>
        /// <returns>
        /// <c>true</c> when the first cell is a numero sign, when one of the inspected cells
        /// is a name title, or when an "owned"/"state" cell has at least three sub-cells
        /// under it; <c>false</c> otherwise, including rows with fewer than three cells.
        /// </returns>
        public static bool WeakHeaderCheck(IAdapter adapter, List <Cell> cells)
        {
            const int maxCellsToInspect = 4;

            if (cells.Count < 3)
            {
                return(false);
            }

            for (int index = 0; index < cells.Count && index < maxCellsToInspect; index++)
            {
                var cell = cells[index];

                // The numero sign only counts in the very first column.
                bool startsWithNumero = index == 0 && HeaderHelpers.IsNumeroSign(cell.Text);
                if (startsWithNumero || HeaderHelpers.IsName(cell.Text))
                {
                    return(true);
                }

                // An ownership group title must span at least three sub-columns.
                bool mentionsOwnership = HeaderHelpers.HasOwnedString(cell.Text) || HeaderHelpers.HasStateString(cell.Text);
                if (mentionsOwnership && FindSubcellsUnder(adapter, cell).Count >= 3)
                {
                    return(true);
                }
            }
            return(false);
        }
コード例 #3
0
ファイル: IAdapterRow.cs プロジェクト: TI-Russia/smart_parser
        /// <summary>
        /// Heuristically decides whether a cell text looks like a person name.
        /// </summary>
        /// <param name="s">Cell text to test; <c>null</c> or blank text is never a name.</param>
        /// <returns>
        /// <c>true</c> when the text contains a dot, when its last word can be a patronymic,
        /// when it consists of exactly three words, or when
        /// <c>ColumnByDataPredictor.PredictByString</c> classifies it as a name field;
        /// <c>false</c> otherwise.
        /// </returns>
        public static bool CheckPersonName(String s)
        {
            if (string.IsNullOrWhiteSpace(s))
            {
                return(false);
            }

            // A dot is accepted outright (presumably it marks initials - confirm).
            if (s.Contains('.'))
            {
                return(true);
            }

            // A single token without inner whitespace is never accepted.
            bool hasSpaces = s.Trim().Any(Char.IsWhiteSpace);
            if (!hasSpaces)
            {
                return(false);
            }

            // Regex.Split returns empty strings when the input starts or ends with a
            // separator; drop them so that leading/trailing whitespace or commas do not
            // distort the "last word" check and the three-word count below.
            string[] words = Regex.Split(s, @"[\,\s\n]+")
                                  .Where(w => w.Length > 0)
                                  .ToArray();

            if (words.Length > 0 && TextHelpers.CanBePatronymic(words[words.Length - 1]))
            {
                return(true);
            }
            if (words.Length != 3)
            {
                // Not the usual three-part name: let the data predictor decide.
                var predictedField = ColumnByDataPredictor.PredictByString(s);
                if (!HeaderHelpers.IsNameDeclarationField(predictedField))
                {
                    return(false);
                }
            }
            return(true);
        }
コード例 #4
0
ファイル: TableHeader.cs プロジェクト: TI-Russia/smart_parser
 /// <summary>
 /// Tells whether any field registered in <c>ColumnOrder</c> is a name field
 /// according to <c>HeaderHelpers.IsNameDeclarationField</c>.
 /// </summary>
 /// <returns><c>true</c> if at least one such field is present; otherwise <c>false</c>.</returns>
 public bool HasNameColumn()
 {
     bool found = false;

     foreach (var field in ColumnOrder.Keys)
     {
         if (HeaderHelpers.IsNameDeclarationField(field))
         {
             // One match is enough, stop scanning.
             found = true;
             break;
         }
     }
     return(found);
 }
コード例 #5
0
ファイル: ReadHeader.cs プロジェクト: TI-Russia/smart_parser
        /// <summary>
        /// Recognizes the special abridged format used by Moscow courts
        /// (see sud_2016.doc in the test cases): exactly three header cells holding,
        /// in this order, the name, the occupation and the declared yearly income.
        /// </summary>
        /// <param name="cells">Cells of the candidate header row.</param>
        /// <returns><c>true</c> when the row matches that three-column layout.</returns>
        public static bool IsNamePositionAndIncomeTable(List <Cell> cells)
        {
            if (cells.Count != 3)
            {
                return(false);
            }

            // Checked left to right; the first mismatch ends the test.
            if (!HeaderHelpers.IsName(cells[0].Text))
            {
                return(false);
            }
            if (!HeaderHelpers.IsOccupation(cells[1].Text))
            {
                return(false);
            }
            return(HeaderHelpers.IsDeclaredYearlyIncome(cells[2].Text));
        }
コード例 #6
0
ファイル: ReadHeader.cs プロジェクト: TI-Russia/smart_parser
        /// <summary>
        /// Maps every header cell to a <c>DeclarationField</c> and registers it in
        /// <paramref name="columnOrdering"/> through <c>AddColumn</c>.
        /// </summary>
        /// <param name="adapter">Table adapter the header cells belong to.</param>
        /// <param name="cells">Header cells to classify, in column order.</param>
        /// <param name="columnOrdering">Table header that receives the recognized columns.</param>
        /// <exception cref="SmartParserException">
        /// A non-trivial title is matched neither by <c>HeaderHelpers.TryGetField</c>
        /// nor by the data predictor.
        /// </exception>
        /// <exception cref="ColumnDetectorException">
        /// A too-short title could not be predicted and the title is not an empty value.
        /// </exception>
        static public void MapColumnTitlesToInnerConstants(IAdapter adapter, List <Cell> cells, TableHeader columnOrdering)
        {
            foreach (var headerCell in cells)
            {
                string title = headerCell.GetText(true);
                Logger.Debug(string.Format("column title: \"{0}\"[{1}]", title.ReplaceEolnWithSpace().CoalesceWhitespace(), headerCell.CellWidth));

                // The title with every absence marker removed.
                string strippedTitle = AbsenceMarkers.Aggregate(title, (acc, marker) => acc.Replace(marker, "")).Trim();

                // Cells whose merged span equals the total row count are skipped.
                if (adapter.GetRowsCount() == headerCell.MergedRowsCount)
                {
                    continue;
                }

                DeclarationField field;
                bool titleTooShort = (title == "" || strippedTitle.Length <= 1) && (title != "№");

                if (titleTooShort)
                {
                    // Nothing usable in the title: predict the field from the column values.
                    field = ColumnByDataPredictor.PredictEmptyColumnTitle(adapter, headerCell);
                    Logger.Debug("Predict: " + field.ToString());
                }
                else
                {
                    field = HeaderHelpers.TryGetField(headerCell.TextAbove, title);

                    // Unrecognized but still short title: give the data predictor a chance.
                    if ((field == DeclarationField.None) && strippedTitle.Length <= 4)
                    {
                        field = ColumnByDataPredictor.PredictEmptyColumnTitle(adapter, headerCell);
                        Logger.Debug("Predict: " + field.ToString());
                    }
                    if (field == DeclarationField.None)
                    {
                        throw new SmartParserException(String.Format("Cannot recognize field \"{0}\"", title.Replace('\n', ' ')));
                    }
                }

                // Only reachable with None from the "too short" branch above.
                if (field == DeclarationField.None && !DataHelper.IsEmptyValue(title))
                {
                    throw new ColumnDetectorException(String.Format("Fail to detect column type row: {0} title:{1}", headerCell.Row, title));
                }

                if (ColumnByDataPredictor.CalcPrecision)
                {
                    ColumnByDataPredictor.PredictForPrecisionCheck(adapter, headerCell, field);
                }

                AddColumn(columnOrdering, field, headerCell);

                // In "name column only" mode stop as soon as the name column is found.
                if (TableHeader.SearchForFioColumnOnly && HeaderHelpers.IsNameDeclarationField(field))
                {
                    break;
                }
            }
        }
コード例 #7
0
        /// <summary>
        /// Predicts the declaration field of a column whose title is empty or too short,
        /// using a sample of the values found under the header cell.
        /// </summary>
        /// <param name="adapter">Table adapter used to read the rows below the header.</param>
        /// <param name="headerCell">Header cell of the column to classify.</param>
        /// <returns>
        /// The predicted field. When no value could be sampled, the result is whatever
        /// <c>PredictByStrings</c> returns for an empty list.
        /// </returns>
        public static DeclarationField PredictEmptyColumnTitle(IAdapter adapter, Cell headerCell)
        {
            List <string> texts           = new List <string>();
            int           rowIndex        = headerCell.Row + headerCell.MergedRowsCount;
            const int     maxRowToCollect = 10;
            int           numbers         = 0;
            int           not_numbers     = 0;

            // The bound on rowIndex is part of the loop condition so that no row past the
            // end of the table is read, even when the header occupies the last rows.
            for (int i = 0; i < maxRowToCollect && rowIndex < adapter.GetRowsCount(); i++)
            {
                var    cells = adapter.GetDataCells(rowIndex, IAdapter.MaxColumnsCount);
                string dummy;
                if (adapter.IsSectionRow(rowIndex, cells, adapter.GetColsCount(), false, out dummy))
                {
                    // Section rows carry no column data.
                    rowIndex += 1;
                }
                else if (IAdapter.IsNumbersRow(cells))
                {
                    // Rows recognized by IsNumbersRow are skipped as well.
                    rowIndex += 1;
                }
                else
                {
                    var c = adapter.GetCell(rowIndex, headerCell.Col);
                    if (c != null)
                    {
                        var txt = c.GetText(true);
                        if (txt.Length > 0)
                        {
                            texts.Add(txt);
                            int d;
                            if (int.TryParse(txt, out d))
                            {
                                numbers += 1;
                            }
                            else
                            {
                                not_numbers += 1;
                            }
                        }
                        // Jump over the whole vertically merged cell.
                        rowIndex += c.MergedRowsCount;
                    }
                    else
                    {
                        rowIndex += 1;
                    }
                }
            }
            var field = DeclarationField.None;

            if (texts.Count == 1 && headerCell.Col == 0 && TextHelpers.CanBePatronymic(texts[0]))
            {
                // not enough data, if texts.Count == 1
                field = DeclarationField.NameOrRelativeType;
            }
            else if (headerCell.Col == 0 && numbers > not_numbers)
            {
                // First column holding mostly integers.
                field = DeclarationField.DeclarantIndex;
            }
            else
            {
                field = PredictByStrings(texts);
                if (field == DeclarationField.NameOrRelativeType && String.Join(" ", texts).Contains(","))
                {
                    field = DeclarationField.NameAndOccupationOrRelativeType;
                }
            }

            if (headerCell.TextAbove != null && ((field & DeclarationField.AllOwnTypes) > 0))
            {
                string h = headerCell.TextAbove;
                // The ownership-type bits are cleared and re-derived from the header text above the cell.
                field &= ~DeclarationField.AllOwnTypes;
                if (HeaderHelpers.IsMixedColumn(h))
                {
                    field |= DeclarationField.Mixed;
                }
                else if (HeaderHelpers.IsStateColumn(h))
                {
                    field |= DeclarationField.State;
                }
                else if (HeaderHelpers.IsOwnedColumn(h))
                {
                    field |= DeclarationField.Owned;
                }
            }
            if (field == DeclarationField.NameOrRelativeType)
            {
                // Sampled values that may contain a role switch the field to name plus occupation.
                if (TextHelpers.MayContainsRole(String.Join(" ", texts)))
                {
                    field = DeclarationField.NameAndOccupationOrRelativeType;
                }
            }
            Logger.Debug(string.Format("predict by {0}  -> {1}",
                                       String.Join("\\n", texts), field));
            return(field);
        }