Example #1
0
        /// <summary>
        /// Writes one randomly positioned run of table rows to the Toloka TSV file.
        /// </summary>
        /// <remarks>
        /// Does nothing when <c>TolokaFileName</c> is an empty string. Otherwise the file is
        /// written from scratch with a column header line and a single task line holding the
        /// task id, the quoted JSON of the selected rows and two empty trailing columns.
        /// </remarks>
        /// <param name="adapter">Source of the table rows.</param>
        /// <param name="columnOrdering">Header description used to locate the end of the header.</param>
        /// <param name="declaration">Parsed declaration; only its sheet title is used.</param>
        /// <param name="inputFileName">Name of the input file, stored in the JSON and used for the task id.</param>
        public static void SaveRandomPortionToToloka(IAdapter adapter, SmartParser.Lib.TableHeader columnOrdering,
                                                     Declaration declaration, string inputFileName)
        {
            if (TolokaFileName == "")
            {
                return;
            }

            string fileID = BuildInputFileId(adapter, inputFileName);

            using (var writer = new System.IO.StreamWriter(TolokaFileName))
            {
                writer.WriteLine("INPUT:input_id\tINPUT:input_json\tGOLDEN:declaration_json\tHINT:text");

                var rng = new Random();

                // At most 20 rows are taken, fewer when the part after the header is shorter.
                int portionSize = Math.Min(20, adapter.GetRowsCount() - columnOrdering.GetPossibleHeaderEnd());
                int firstRow    = rng.Next(columnOrdering.GetPossibleHeaderEnd(),
                                           adapter.GetRowsCount() - portionSize);
                int lastRow     = firstRow + portionSize;

                var portion = adapter.TablePortionToJson(columnOrdering, firstRow, lastRow);
                portion.InputFileName = inputFileName;
                portion.Title         = declaration.Properties.SheetTitle;

                // Tabs would break the TSV layout, and quotes are doubled because the JSON
                // is written as a quoted field.
                string payload = JsonConvert.SerializeObject(portion);
                payload = payload.Replace("\t", " ");
                payload = payload.Replace("\\t", " ");
                payload = payload.Replace("\"", "\"\"");

                string taskId = fileID + "_" + firstRow + "_" + lastRow;
                writer.WriteLine(taskId + "\t" + "\"" + payload + "\"\t\t");
            }
        }
Example #2
0
        /// <summary>
        /// Builds a <see cref="TableHeader"/> for the table served by <paramref name="adapter"/>:
        /// processes the title, adjusts the header start row with <c>Fix31832</c> and reads the header.
        /// </summary>
        /// <param name="adapter">Adapter giving access to the table.</param>
        /// <returns>The header description filled by <c>ProcessTitle</c> and <c>ReadHeader</c>.</returns>
        public static TableHeader ExamineTableBeginning(IAdapter adapter)
        {
            var header = new TableHeader();

            int rowAfterTitle = ProcessTitle(adapter, header);
            int headerRow     = Fix31832(adapter, rowAfterTitle);

            ReadHeader(adapter, headerRow, header);
            return header;
        }
Example #3
0
        /// <summary>
        /// Creates a declarant named by <c>GetPersonName()</c>, bound to the given column
        /// ordering and with index 1.
        /// </summary>
        /// <param name="columnOrdering">Header description stored in the new declarant.</param>
        /// <returns>The newly created declarant.</returns>
        private PublicServant CreatePublicServant(TableHeader columnOrdering)
        {
            return new PublicServant
            {
                NameRaw  = GetPersonName(),
                Ordering = columnOrdering,
                Index    = 1
            };
        }
Example #4
0
 /// <summary>
 /// Creates a row bound to <paramref name="adapter"/> and loads its cells up to the
 /// header's maximal column end index.
 /// </summary>
 /// <param name="adapter">Adapter the cells are read from.</param>
 /// <param name="columnOrdering">Header description of the table the row belongs to.</param>
 /// <param name="row">Index of the row in the adapter.</param>
 public DataRow(IAdapter adapter, TableHeader columnOrdering, int row)
 {
     this.row       = row;
     this.adapter   = adapter;
     ColumnOrdering = columnOrdering;

     int lastColumn = columnOrdering.GetMaxColumnEndIndex();
     Cells = adapter.GetDataCells(row, lastColumn);

     // Cells are mapped for every adapter except Excel ones.
     if (this.adapter.IsExcel())
     {
         return;
     }
     MapCells();
 }
Example #5
0
 /// <summary>
 /// Attaches an input row to the current person, unless there is no current person or the
 /// row was divided by <c>DivideDeclarantAndRelativesBySoftEolns</c>.
 /// </summary>
 /// <param name="columnOrdering">Header description of the table.</param>
 /// <param name="row">The row to attach.</param>
 public void AddInputRowToCurrentPerson(TableHeader columnOrdering, DataRow row)
 {
     if (CurrentPerson == null)
     {
         return;
     }

     bool wasDivided = DivideDeclarantAndRelativesBySoftEolns(columnOrdering, row);
     if (wasDivided)
     {
         return;
     }

     CurrentPerson.DateRows.Add(row);
     TransposeTableByRelatives(columnOrdering, row);
 }
Example #6
0
 /// <summary>
 /// Re-labels the NameAndOccupationOrRelativeType column as NameOrRelativeType when the
 /// header also has an Occupation column.
 /// </summary>
 /// <param name="c">Header description to correct in place.</param>
 static void FixBadColumnName02(TableHeader c)
 {
     bool hasCombinedColumn = c.ContainsField(DeclarationField.NameAndOccupationOrRelativeType);
     if (!hasCombinedColumn || !c.ContainsField(DeclarationField.Occupation))
     {
         return;
     }

     // Register the column under the new field first, then drop the old entry.
     TColumnInfo column = c.ColumnOrder[DeclarationField.NameAndOccupationOrRelativeType];
     column.Field = DeclarationField.NameOrRelativeType;
     c.Add(column);
     c.Delete(DeclarationField.NameAndOccupationOrRelativeType);
 }
Example #7
0
        /// <summary>
        /// Prints the text of one column for every row from the first data row to the end of
        /// the table, one row per console line.
        /// </summary>
        /// <remarks>
        /// A missing cell is printed as the word "null"; line feeds inside a cell are printed
        /// as the two characters <c>\n</c>.
        /// </remarks>
        static void DumpColumn(IAdapter adapter, SmartParser.Lib.TableHeader columnOrdering, DeclarationField columnToDump)
        {
            int row = columnOrdering.FirstDataRow;

            while (row < adapter.GetRowsCount())
            {
                DataRow dataRow = adapter.GetRow(columnOrdering, row);
                var     cell    = dataRow.GetDeclarationField(columnToDump);

                string text;
                if (cell == null)
                {
                    text = "null";
                }
                else
                {
                    text = cell.GetText();
                }

                Console.WriteLine(text.Replace("\n", "\\n"));
                row++;
            }
        }
Example #8
0
 /// <summary>
 /// Replaces a merged StateColumnWithNaturalText column by the type / square / country
 /// state property columns (see niz_kam.docx in tests).
 /// </summary>
 static void FixMissingSubheadersForStateColumn(IAdapter adapter, TableHeader columnOrdering)
 {
     DeclarationField[] stateSubColumns =
     {
         DeclarationField.StatePropertyType,
         DeclarationField.StatePropertySquare,
         DeclarationField.StatePropertyCountry,
     };

     FixMissingSubheadersForMergedColumns(adapter, columnOrdering,
                                          DeclarationField.StateColumnWithNaturalText,
                                          stateSubColumns);
 }
Example #9
0
 /// <summary>
 /// Replaces a merged MixedColumnWithNaturalText column by the type / square / country
 /// mixed real estate columns (see DepEnergo2010.doc in tests).
 /// </summary>
 static void FixMissingSubheadersForMixedRealEstate(IAdapter adapter, TableHeader columnOrdering)
 {
     DeclarationField[] mixedSubColumns =
     {
         DeclarationField.MixedRealEstateType,
         DeclarationField.MixedRealEstateSquare,
         DeclarationField.MixedRealEstateCountry
     };

     FixMissingSubheadersForMergedColumns(adapter, columnOrdering,
                                          DeclarationField.MixedColumnWithNaturalText,
                                          mixedSubColumns);
 }
Example #10
0
 /// <summary>
 /// Replaces a merged OwnedColumnWithNaturalText column by the owned real estate
 /// type / square / country columns plus the vehicle column (see niz_kam.docx in tests).
 /// </summary>
 static void FixMissingSubheadersForOwnedColumn(IAdapter adapter, TableHeader columnOrdering)
 {
     DeclarationField[] ownedSubColumns =
     {
         DeclarationField.OwnedRealEstateType,
         DeclarationField.OwnedRealEstateSquare,
         DeclarationField.OwnedRealEstateCountry,
         DeclarationField.Vehicle
     };

     FixMissingSubheadersForMergedColumns(adapter, columnOrdering,
                                          DeclarationField.OwnedColumnWithNaturalText,
                                          ownedSubColumns);
 }
Example #11
0
 /// <summary>
 /// Re-labels a "natural text" column as the matching type column when the header already
 /// has the corresponding country and square columns.
 /// </summary>
 /// <param name="c">Header description to correct in place.</param>
 /// <param name="naturalText">Field of the column to re-label.</param>
 /// <param name="country">Country field that must be present.</param>
 /// <param name="square">Square field that must be present.</param>
 /// <param name="type">Field the column is re-labelled to.</param>
 static void FixBadColumnName01_Template(TableHeader c, DeclarationField naturalText, DeclarationField country, DeclarationField square, DeclarationField type)
 {
     bool applicable = c.ContainsField(naturalText) &&
                       c.ContainsField(country) &&
                       c.ContainsField(square);
     if (!applicable)
     {
         return;
     }

     // Register the column under the new field first, then drop the old entry.
     TColumnInfo column = c.ColumnOrder[naturalText];
     column.Field = type;
     c.Add(column);
     c.Delete(naturalText);
 }
Example #12
0
        /// <summary>
        /// Maps the cells of a row to header fields positionally (i-th cell to i-th merged
        /// column), checking that each cell's pixel span intersects the span of its column.
        /// </summary>
        /// <remarks>
        /// A cell whose span does not intersect its column counts as a pixel error. If such a cell
        /// has text, the text must pass <c>ColumnByDataPredictor.TestFieldWithoutOwntypes</c> for the
        /// column's field, otherwise the mapping fails; a passing test resets the error counter.
        /// </remarks>
        /// <param name="columnOrdering">Header description with the merged column order.</param>
        /// <param name="cells">Cells of one row, left to right.</param>
        /// <returns>
        /// The field-to-cell map; an empty map when both the header and the row are empty;
        /// <c>null</c> when the counts differ, a cell cannot be mapped, or the final pixel error
        /// count is 3 or more.
        /// </returns>
        static Dictionary <DeclarationField, Cell> MapByOrderAndIntersection(TableHeader columnOrdering, List <Cell> cells)
        {
            if (columnOrdering.MergedColumnOrder.Count != cells.Count)
            {
                return(null);
            }
            var res = new Dictionary <DeclarationField, Cell>();

            if (cells.Count == 0)
            {
                // Nothing to map; cells[0] below would throw on an empty row.
                return(res);
            }
            int start           = cells[0].AdditTableIndention;
            int pixelErrorCount = 0;

            for (int i = 0; i < cells.Count; i++)
            {
                int s1      = start;
                int e1      = start + cells[i].CellWidth;
                var colInfo = columnOrdering.MergedColumnOrder[i];
                int s2      = colInfo.ColumnPixelStart;
                int e2      = colInfo.ColumnPixelStart + colInfo.ColumnPixelWidth;
                if (TableHeader.PeriodIntersection(s1, e1, s2, e2) == 0)
                {
                    pixelErrorCount += 1;
                    if (!DataHelper.IsEmptyValue(cells[i].Text))
                    {
                        // The geometry disagrees, so fall back to checking the cell content.
                        if (!ColumnByDataPredictor.TestFieldWithoutOwntypes(colInfo.Field, cells[i]))
                        {
                            Logger.Debug(string.Format("cannot map column N={0} text={1}", i, cells[i].Text.Replace("\n", "\\n")));
                            return(null);
                        }
                        else
                        {
                            Logger.Debug(string.Format("found semantic argument for mapping N={0} text={1} to {2}",
                                                       i, cells[i].Text.Replace("\n", "\\n"), colInfo.Field));
                            pixelErrorCount = 0;
                        }
                    }
                }
                res[colInfo.Field] = cells[i];

                // The next cell starts where this one ends.
                start = e1;
            }
            if (pixelErrorCount >= 3)
            {
                return(null);
            }
            return(res);
        }
        /// <summary>
        /// Feeds the text of every merged column of <paramref name="row"/> to
        /// <c>IncrementTrigrams</c>; a missing cell contributes an empty string.
        /// </summary>
        /// <param name="columnOrdering">Header description listing the columns to visit.</param>
        /// <param name="row">The data row whose cells are read.</param>
        public static void UpdateByRow(TableHeader columnOrdering, DataRow row)
        {
            // Without an external file name there is nowhere to write to.
            Debug.Assert(ColumnByDataPredictor.ExternalFileName != null);

            foreach (var column in columnOrdering.MergedColumnOrder)
            {
                try
                {
                    var cell = row.GetDeclarationField(column.Field);
                    var text = "";
                    if (cell != null)
                    {
                        text = cell.GetText();
                    }
                    IncrementTrigrams(column.Field, text);
                }
                catch (Exception)
                {
                    // Best effort: a column that fails is skipped without reporting.
                }
            }
        }
Example #14
0
 /// <summary>
 /// Applies <c>FixBadColumnName01_Template</c> to the mixed, state and owned column
 /// families, in that order.
 /// </summary>
 /// <param name="c">Header description to correct in place.</param>
 static void FixBadColumnName01(TableHeader c)
 {
     FixBadColumnName01_Template(c,
                                 naturalText: DeclarationField.MixedColumnWithNaturalText,
                                 country: DeclarationField.MixedRealEstateCountry,
                                 square: DeclarationField.MixedRealEstateSquare,
                                 type: DeclarationField.MixedRealEstateType);

     FixBadColumnName01_Template(c,
                                 naturalText: DeclarationField.StateColumnWithNaturalText,
                                 country: DeclarationField.StatePropertyCountry,
                                 square: DeclarationField.StatePropertySquare,
                                 type: DeclarationField.StatePropertyType);

     FixBadColumnName01_Template(c,
                                 naturalText: DeclarationField.OwnedColumnWithNaturalText,
                                 country: DeclarationField.OwnedRealEstateCountry,
                                 square: DeclarationField.OwnedRealEstateSquare,
                                 type: DeclarationField.OwnedRealEstateType);
 }
Example #15
0
        /// <summary>
        /// Splits a Vehicle header cell that spans two columns into separate VehicleType and
        /// VehicleModel columns when its title mentions vehicles, kind and make.
        /// </summary>
        /// <remarks>
        /// Nothing is changed when the header has no Vehicle field, the header cell does not
        /// span exactly two columns, or exactly one sub-cell is found under it.
        /// </remarks>
        static void FixMissingSubheadersForVehicle(IAdapter adapter, TableHeader columnOrdering)
        {
            if (!columnOrdering.ContainsField(DeclarationField.Vehicle))
            {
                return;
            }

            TColumnInfo unused;
            var vehicleHeader = adapter.GetDeclarationFieldWeak(columnOrdering, columnOrdering.HeaderBegin.Value, DeclarationField.Vehicle, out unused);

            if (vehicleHeader.MergedColsCount != 2)
            {
                return;
            }

            if (FindSubcellsUnder(adapter, vehicleHeader).Count == 1)
            {
                return;
            }

            // The title is matched in lower case with spaces removed.
            string compactTitle = vehicleHeader.Text.ToLower().Replace(" ", "");
            bool   namesTypeAndModel = compactTitle.Contains("транспортныесредства") &&
                                       compactTitle.Contains("марка") &&
                                       compactTitle.Contains("вид");

            if (!namesTypeAndModel)
            {
                return;
            }

            // Each of the two new columns gets half of the merged cell's width.
            var typeColumn = new TColumnInfo
            {
                BeginColumn      = vehicleHeader.Col,
                EndColumn        = vehicleHeader.Col + 1,
                ColumnPixelWidth = vehicleHeader.CellWidth / 2,
                Field            = DeclarationField.VehicleType
            };
            columnOrdering.Add(typeColumn);

            var modelColumn = new TColumnInfo
            {
                BeginColumn      = vehicleHeader.Col + 1,
                EndColumn        = vehicleHeader.Col + 2,
                ColumnPixelWidth = vehicleHeader.CellWidth / 2,
                Field            = DeclarationField.VehicleModel
            };
            columnOrdering.Add(modelColumn);

            columnOrdering.Delete(DeclarationField.Vehicle);
        }
Example #16
0
 /// <summary>
 /// Checks whether <paramref name="row"/> starts a new table header and, if so, reads it.
 /// </summary>
 /// <param name="row">The candidate row.</param>
 /// <param name="columnOrdering">
 /// The header read from the row when the method returns <c>true</c>; <c>null</c> otherwise.
 /// </param>
 /// <returns>
 /// <c>true</c> when the row passes the weak header check and <c>ReadHeader</c> succeeds;
 /// <c>false</c> when the check fails or reading throws (the error is logged at debug level
 /// and the row is left to be treated as data).
 /// </returns>
 bool IsHeaderRow(DataRow row, out TableHeader columnOrdering)
 {
     columnOrdering = null;
     if (!TableHeaderRecognizer.WeakHeaderCheck(Adapter, row.Cells))
     {
         return(false);
     }
     try
     {
         // Build into a local so that a failed read never leaks a half-filled header.
         var candidate = new TableHeader();
         TableHeaderRecognizer.ReadHeader(Adapter, row.GetRowIndex(), candidate);
         columnOrdering = candidate;
         return(true);
     }
     catch (Exception e)
     {
         // Argument order matches the placeholders: {0} is the row index, {1} is the error.
         Logger.Debug(String.Format("Cannot parse possible header, row={0}, error={1}, so skip it may be it is a data row ", row.GetRowIndex(), e.ToString()));
     }
     return(false);
 }
Example #17
0
        /// <summary>
        /// Builds a JSON-ready portion of the table: the header rows, the nearest section row
        /// at or before <paramref name="body_start"/>, and the non-empty data rows.
        /// </summary>
        /// <param name="columnOrdering">Header description giving the header row range.</param>
        /// <param name="body_start">Index of the first data row of the portion.</param>
        /// <param name="body_end">
        /// End row index; only the difference <c>body_end - body_start</c> is used, as the maximal
        /// number of non-empty rows to take.
        /// </param>
        /// <returns>
        /// The portion; its <c>DataEnd</c> is the index of the row after the last one examined.
        /// </returns>
        public TJsonTablePortion TablePortionToJson(TableHeader columnOrdering, int body_start, int body_end)
        {
            var portion = new TJsonTablePortion();

            portion.DataStart = body_start;
            int headerEnd = columnOrdering.GetPossibleHeaderEnd();

            int headerRow = columnOrdering.GetPossibleHeaderBegin();
            while (headerRow < columnOrdering.GetPossibleHeaderEnd())
            {
                portion.Header.Add(GetJsonByRow(GetDataCells(headerRow)));
                headerRow++;
            }

            // Walk upwards from the first data row looking for the section that precedes the data.
            for (int rowIndex = body_start; rowIndex >= headerEnd; rowIndex--)
            {
                string sectionName;
                // cannot use prevRowIsSection
                var rowCells = GetDataCells(rowIndex);
                if (!IsSectionRow(rowIndex, rowCells, columnOrdering.GetMaxColumnEndIndex(), false, out sectionName))
                {
                    continue;
                }
                portion.Section.Add(GetJsonByRow(rowCells));
                break;
            }

            int rowLimit  = body_end - body_start;
            int takenRows = 0;

            // Empty rows are skipped and do not count towards the limit.
            for (portion.DataEnd = body_start;
                 portion.DataEnd < GetRowsCount() && takenRows < rowLimit;
                 portion.DataEnd++)
            {
                if (IsEmptyRow(portion.DataEnd))
                {
                    continue;
                }
                portion.Data.Add(GetJsonByRow(GetDataCells(portion.DataEnd)));
                takenRows++;
            }
            return portion;
        }
Example #18
0
        /// <summary>
        /// Adds a column for <paramref name="field"/> that spans the columns covered by
        /// <paramref name="cell"/>; for income columns the year found in the cell title, if any,
        /// is stored in <c>ordering.YearFromIncome</c>.
        /// </summary>
        /// <param name="ordering">Header description the column is added to.</param>
        /// <param name="field">Field the new column represents.</param>
        /// <param name="cell">Header cell giving the column position and pixel width.</param>
        static void AddColumn(TableHeader ordering, DeclarationField field, Cell cell)
        {
            var column = new TColumnInfo
            {
                BeginColumn      = cell.Col,
                EndColumn        = cell.Col + cell.MergedColsCount,
                ColumnPixelWidth = cell.CellWidth,
                // ColumnPixelStart is unknown here and is initialized in FinishOrderingBuilding
                Field            = field
            };

            if (IsIncomeColumn(field))
            {
                // The same throwaway string receives both text outputs; only the year matters.
                string ignored = "";
                int?   year    = null;
                bool   parsed  = TableHeaderRecognizer.GetValuesFromTitle(cell.GetText(), ref ignored, ref year, ref ignored);
                if (parsed && year.HasValue)
                {
                    ordering.YearFromIncome = year.Value;
                }
            }

            ordering.Add(column);
        }
Example #19
0
        /// <summary>
        /// Parses the table into a declaration: finds person borders and names, then, unless only
        /// the name column is being searched for, runs the second pass over the declarants.
        /// </summary>
        /// <param name="columnOrdering">Header description of the table.</param>
        /// <param name="updateTrigrams">Passed through to <c>FindBordersAndPersonNames</c>.</param>
        /// <param name="documentfile_id">Document file id passed to <c>InitializeDeclaration</c>.</param>
        /// <returns>The parsed declaration.</returns>
        public Declaration Parse(TableHeader columnOrdering, bool updateTrigrams, int?documentfile_id)
        {
            DateTime    startedAt   = DateTime.Now;
            Declaration declaration = InitializeDeclaration(Adapter, columnOrdering, documentfile_id);

            var borderFinder = new TBorderFinder(Adapter, declaration, FailOnRelativeOrphan);
            borderFinder.FindBordersAndPersonNames(columnOrdering, updateTrigrams);

            if (!TableHeader.SearchForFioColumnOnly)
            {
                var secondPass     = new TSecondPassParser(Adapter);
                int declarantCount = secondPass.ParseDeclarants(declaration);

                // Both timings are measured from the very start of parsing.
                double elapsedForRate = (DateTime.Now - startedAt).TotalSeconds;
                Logger.Info("Final Rate: {0:0.00} declarant in second", declarantCount / elapsedForRate);

                double elapsedTotal = (DateTime.Now - startedAt).TotalSeconds;
                Logger.Info("Total time: {0:0.00} seconds", elapsedTotal);
            }
            return declaration;
        }
Example #20
0
        /// <summary>
        /// Creates an empty declaration whose properties come from the document file name, the
        /// adapter and the table header.
        /// </summary>
        /// <remarks>
        /// A supplied <paramref name="user_documentfile_id"/> overrides the id parsed from the file
        /// name. When the header carries a year taken from an income column, the declaration year
        /// becomes the larger of that year and the header year, or the income year alone when the
        /// header has no year.
        /// </remarks>
        /// <param name="adapter">Adapter giving the document file, worksheet index and document url.</param>
        /// <param name="columnOrdering">Header description giving the title and years.</param>
        /// <param name="user_documentfile_id">Optional explicit document file id.</param>
        /// <returns>A declaration with its properties filled in.</returns>
        public static Declaration InitializeDeclaration(IAdapter adapter, TableHeader columnOrdering, int?user_documentfile_id)
        {
            // Parse the file name; the boolean outcome of the parse is not used.
            int?   documentfile_id;
            string archive;
            DataHelper.ParseDocumentFileName(adapter.DocumentFile, out documentfile_id, out archive);

            if (user_documentfile_id.HasValue)
            {
                documentfile_id = user_documentfile_id;
            }

            var properties = new DeclarationProperties
            {
                SheetTitle      = columnOrdering.Title,
                Year            = columnOrdering.Year,
                DocumentFileId  = documentfile_id,
                ArchiveFileName = archive,
                SheetNumber     = adapter.GetWorksheetIndex(),
                DocumentUrl     = adapter.GetDocumentUrlFromMetaTag()
            };

            if (columnOrdering.YearFromIncome != null)
            {
                if (properties.Year == null)
                {
                    properties.Year = columnOrdering.YearFromIncome;
                }
                else
                {
                    properties.Year = Math.Max(columnOrdering.YearFromIncome.Value, properties.Year.Value);
                }
            }

            return new Declaration
            {
                Properties = properties
            };
        }
Example #21
0
        /// <summary>
        /// Returns the cell of <paramref name="row"/> located at the first column of
        /// <paramref name="field"/>.
        /// </summary>
        /// <remarks>
        /// Used directly only while ColumnOrdering has not been built yet; in all other cases
        /// Row.GetDeclarationField should be used.
        /// </remarks>
        /// <param name="columnOrdering">Header description that maps fields to columns.</param>
        /// <param name="row">Index of the row to read.</param>
        /// <param name="field">Field whose cell is requested.</param>
        /// <param name="colSpan">Receives the column info registered for the field.</param>
        /// <returns>The cell at the field's begin column.</returns>
        /// <exception cref="SmartParserFieldNotFoundException">
        /// The field is not in the header, or the row has no cell at the field's begin column.
        /// </exception>
        public virtual Cell GetDeclarationFieldWeak(TableHeader columnOrdering, int row, DeclarationField field, out TColumnInfo colSpan)
        {
            bool fieldIsKnown = columnOrdering.ColumnOrder.TryGetValue(field, out colSpan);

            if (!fieldIsKnown)
            {
                throw new SmartParserFieldNotFoundException(String.Format("Field {0} not found, row={1}", field.ToString(), row));
            }

            var cell = GetCell(row, colSpan.BeginColumn);

            if (cell != null)
            {
                return cell;
            }

            // Report how many cells the row really has to make the failure easier to diagnose.
            int cellsInRow = GetDataCells(row).Count;

            throw new SmartParserFieldNotFoundException(String.Format("Field {0} not found, row={1}, col={2}. Row.Cells.Count = {3}",
                                                                      field.ToString(),
                                                                      row,
                                                                      colSpan.BeginColumn,
                                                                      cellsInRow));
        }
Example #22
0
        /// <summary>
        /// Maps each cell of a row to the header field found by pixel intersection; when several
        /// cells hit the same field, the one with the largest intersection wins.
        /// </summary>
        /// <remarks>
        /// This lets two header cells be mapped to one data cell; see dnko-2014.docx for an example.
        /// Cells of zero or negative width are not mapped but still advance the pixel position.
        /// </remarks>
        /// <param name="columnOrdering">Header description used for the pixel lookup.</param>
        /// <param name="cells">Cells of one row, left to right.</param>
        /// <returns>
        /// The field-to-cell map (empty for an empty row), or <c>null</c> when a cell with
        /// non-blank text matches no field.
        /// </returns>
        static Dictionary <DeclarationField, Cell> MapByMaxIntersection(TableHeader columnOrdering, List <Cell> cells)
        {
            Logger.Debug("MapByMaxIntersection");

            var mapped      = new Dictionary <DeclarationField, Cell>();
            var bestOverlap = new Dictionary <DeclarationField, int>();

            if (cells.Count == 0)
            {
                return mapped;
            }

            int left = cells[0].AdditTableIndention;

            foreach (var cell in cells)
            {
                if (cell.CellWidth > 0)
                {
                    int overlap = 0;
                    var field   = columnOrdering.FindByPixelIntersection(left, left + cell.CellWidth, out overlap);

                    // Text that cannot be attributed to any field means the mapping has failed.
                    if (field == DeclarationField.None && cell.Text.Trim().Length > 0)
                    {
                        return null;
                    }

                    // Keep only the cell with the maximal pixel intersection for each field.
                    int knownOverlap;
                    if (!bestOverlap.TryGetValue(field, out knownOverlap) || knownOverlap < overlap)
                    {
                        mapped[field]      = cell;
                        bestOverlap[field] = overlap;
                    }
                }
                left += cell.CellWidth;
            }
            return mapped;
        }
Example #23
0
        /// <summary>
        /// Replaces one merged header column by the given sub-columns, using the cells found
        /// under the merged header cell.
        /// </summary>
        /// <remarks>
        /// Nothing is changed when the header lacks <paramref name="mergedField"/>, when the number
        /// of sub-cells differs from the number of sub-columns, or when <c>CheckSquareColumn</c>
        /// rejects the table. Only the second sub-column is checked; checking the first and the
        /// third is still a todo.
        /// </remarks>
        /// <param name="adapter">Adapter giving access to the table.</param>
        /// <param name="columnOrdering">Header description to correct in place.</param>
        /// <param name="mergedField">Field of the merged column to replace.</param>
        /// <param name="subColumns">Fields for the sub-cells, left to right.</param>
        static void FixMissingSubheadersForMergedColumns(IAdapter adapter, TableHeader columnOrdering,
                                                         DeclarationField mergedField, DeclarationField[] subColumns)
        {
            if (!columnOrdering.ContainsField(mergedField))
            {
                return;
            }

            TColumnInfo unused;
            var mergedHeader = adapter.GetDeclarationFieldWeak(columnOrdering, columnOrdering.HeaderBegin.Value, mergedField, out unused);
            var subCells     = FindSubcellsUnder(adapter, mergedHeader);

            if (subCells.Count != subColumns.Count())
            {
                return;
            }
            if (!CheckSquareColumn(adapter, columnOrdering.FirstDataRow, 5, subCells, 1))
            {
                return;
            }

            int position = 0;
            while (position < subColumns.Count())
            {
                AddColumn(columnOrdering, subColumns[position], subCells[position]);
                ++position;
            }
            columnOrdering.Delete(mergedField);
        }
Example #24
0
        /// <summary>
        /// Reads the table header that starts at <paramref name="headerStartRow"/> into
        /// <paramref name="columnOrdering"/>: maps the column titles, records the header bounds
        /// and the first data row, applies the header fix-ups and finishes the ordering.
        /// </summary>
        /// <param name="adapter">Adapter giving access to the table.</param>
        /// <param name="headerStartRow">Index of the first header row.</param>
        /// <param name="columnOrdering">Header description to fill.</param>
        /// <exception cref="SmartParserException">No column was recognized in the header.</exception>
        public static void ReadHeader(IAdapter adapter, int headerStartRow, TableHeader columnOrdering)
        {
            int headerEndRow;
            var headerCells = GetColumnCells(adapter, headerStartRow, out headerEndRow);

            MapColumnTitlesToInnerConstants(adapter, headerCells, columnOrdering);

            columnOrdering.HeaderBegin = headerStartRow;
            columnOrdering.HeaderEnd   = headerEndRow;

            int dataRow = columnOrdering.HeaderEnd.Value;

            // Skip the row that numbers the columns: its first two cells start with "1" and "2".
            if (dataRow < adapter.GetRowsCount())
            {
                string firstCellText  = adapter.GetCell(dataRow, 0).GetText();
                string secondCellText = adapter.GetCell(dataRow, 1).GetText();
                bool   isNumbersRow   = firstCellText.StartsWith("1") && secondCellText.StartsWith("2");
                if (isNumbersRow)
                {
                    dataRow++;
                }
            }

            columnOrdering.FirstDataRow = dataRow;

            if (columnOrdering.ColumnOrder.Count() == 0)
            {
                throw new SmartParserException("cannot find headers");
            }

            // The fix-ups run in this fixed order.
            FixMissingSubheadersForMixedRealEstate(adapter, columnOrdering);
            FixMissingSubheadersForVehicle(adapter, columnOrdering);
            FixBadColumnName01(columnOrdering);
            FixBadColumnName02(columnOrdering);
            FixMissingSubheadersForOwnedColumn(adapter, columnOrdering);
            FixMissingSubheadersForStateColumn(adapter, columnOrdering);

            columnOrdering.FinishOrderingBuilding(headerCells[0].AdditTableIndention);
        }
Example #25
0
        /// <summary>
        /// Tries to divide a row whose name cell lists several persons on separate soft-broken
        /// lines; the row is cloned once per detected border and every cell is divided accordingly.
        /// </summary>
        /// <remarks>
        /// An example document is 8562.pdf.docx in tests. The original note on this method says the
        /// string width is calculated with graphics.MeasureString methods; that presumably happens
        /// in the helpers called here - confirm before relying on it.
        /// </remarks>
        /// <param name="columnOrdering">Header description of the table.</param>
        /// <param name="row">The row to examine and possibly divide.</param>
        /// <returns>
        /// <c>true</c> when the row was divided; <c>false</c> when the row does not qualify or
        /// <c>DividedLinesToDataRows</c> fails for one of the parts.
        /// </returns>
        bool DivideDeclarantAndRelativesBySoftEolns(TableHeader columnOrdering, DataRow row)
        {
            if (CurrentDeclarant.Relatives.Any())
            {
                return(false);
            }
            if (!columnOrdering.ContainsField(DeclarationField.NameOrRelativeType))
            {
                return(false);
            }
            Cell nameCell = row.GetDeclarationField(DeclarationField.NameOrRelativeType);

            // The null check comes first: placed after the type tests it could never be reached.
            if (nameCell is null)
            {
                return(false);
            }
            if (!(nameCell is OpenXmlWordCell) && !(nameCell is HtmlAdapterCell))
            {
                return(false);
            }
            if (nameCell.IsEmpty)
            {
                return(false);
            }
            if (row.adapter.IsExcel())
            {
                return(false);                       // no font info
            }
            List <string> lines = GetLinesWithSoftBreaks(nameCell);

            if (lines.Count < 2)
            {
                return(false);
            }

            // Line 0 always starts a part; each later line that parses as a relation type starts another.
            List <int> borders = new List <int>()
            {
                0
            };

            for (int i = 1; i < lines.Count; ++i)
            {
                if (DataHelper.ParseRelationType(lines[i], false) != RelationType.Error)
                {
                    borders.Add(i);
                }
            }
            if (borders.Count == 1)
            {
                return(false);
            }
            List <DataRow> dividedLines = new List <DataRow>();

            for (int i = 0; i < borders.Count; ++i)
            {
                dividedLines.Add(row.DeepClone());
            }
            for (int i = 0; i < row.Cells.Count; ++i)
            {
                DivideCell(row, i, borders, dividedLines);
            }
            for (int k = 0; k < borders.Count; ++k)
            {
                if (!DividedLinesToDataRows(row, dividedLines, k))
                {
                    return(false);
                }
            }
            Logger.Debug(String.Format("Divide line to {0} parts", borders.Count));
            return(true);
        }
Example #26
0
        /// <summary>
        /// Parses one input document into a declaration.
        /// </summary>
        /// <remarks>
        /// When the adapter has no predefined scheme, the table header is recognized first. If
        /// recognition fails, the last successfully recognized header is reused with its first
        /// data row reset to 0; without such a header the original exception is rethrown.
        /// The header is then logged and validated before the parser runs and a random portion is
        /// saved for Toloka. With a predefined scheme, parsing is delegated to that scheme.
        /// </remarks>
        /// <param name="adapter">Adapter giving access to the document.</param>
        /// <param name="inputFile">Path of the input file; its file name may supply the year.</param>
        /// <returns>
        /// The parsed declaration, or <c>null</c> when only the columns were requested
        /// (<c>ColumnsOnly</c>) or a column was dumped (<c>ColumnToDump</c>).
        /// </returns>
        /// <exception cref="SmartParserException">
        /// The header has no declarant name column, or no income column while not searching for
        /// the name column only.
        /// </exception>
        static Declaration BuildDeclarations(IAdapter adapter, string inputFile)
        {
            Declaration declaration;
            string      inputFileName = Path.GetFileName(inputFile);

            SmartParser.Lib.Parser parser = new SmartParser.Lib.Parser(adapter, !SkipRelativeOrphan);

            if (adapter.CurrentScheme == default)
            {
                SmartParser.Lib.TableHeader?columnOrdering = null;
                try
                {
                    columnOrdering   = TableHeaderRecognizer.ExamineTableBeginning(adapter);
                    LastGoodOrdering = columnOrdering;
                }
                catch (Exception ex)
                {
                    Logger.Info(ex.Message);
                    if (LastGoodOrdering == null)
                    {
                        // Rethrow with "throw;" so that the original stack trace is preserved.
                        throw;
                    }
                    Logger.Info("use the last known table header scheme");
                    columnOrdering = LastGoodOrdering;
                    // The reused header belongs to another table, so its data starts at row 0 here.
                    columnOrdering.FirstDataRow = 0;
                }

                // Try to extract declaration year from file name if we weren't able to get it from document title
                if (!columnOrdering.Year.HasValue)
                {
                    columnOrdering.Year = TextHelpers.ExtractYear(inputFileName);
                }

                Logger.Info("Column ordering: ");
                foreach (var ordering in columnOrdering.ColumnOrder)
                {
                    Logger.Info(ordering.ToString());
                }

                Logger.Info(String.Format("OwnershipTypeInSeparateField: {0}",
                                          columnOrdering.OwnershipTypeInSeparateField));

                if (ColumnsOnly)
                {
                    return(null);
                }

                if (ColumnToDump != DeclarationField.None)
                {
                    DumpColumn(adapter, columnOrdering, ColumnToDump);
                    return(null);
                }

                if (columnOrdering.Title != null)
                {
                    Logger.Info("Declaration Title: {0} ", columnOrdering.Title);
                }

                if (columnOrdering.Year != null)
                {
                    Logger.Info("Declaration Year: {0} ", columnOrdering.Year.Value);
                }

                if (columnOrdering.MinistryName != null)
                {
                    Logger.Info("Declaration Ministry: {0} ", columnOrdering.MinistryName);
                }


                if (!columnOrdering.HasNameColumn())
                {
                    // TODO: first look for the first section_row and check it; the full name may be exactly there
                    // https://declarator.org/admin/declarations/jsonfile/186842/change/
                    throw new SmartParserException("Insufficient fields: No any of Declarant Name fields found.");
                }

                if (!(columnOrdering.ContainsField(DeclarationField.DeclarantIncome) ||
                      columnOrdering.ContainsField(DeclarationField.DeclarantIncomeInThousands) ||
                      columnOrdering.ContainsField(DeclarationField.DeclaredYearlyIncome) ||
                      columnOrdering.ContainsField(DeclarationField.DeclaredYearlyIncomeThousands)))
                {
                    if (!SmartParser.Lib.TableHeader.SearchForFioColumnOnly)
                    {
                        throw new SmartParserException("Insufficient fields: No any of Declarant Income fields found.");
                    }
                }

                declaration = parser.Parse(columnOrdering, BuildTrigrams, UserDocumentFileId);
                SaveRandomPortionToToloka(adapter, columnOrdering, declaration, inputFile);
            }
            else
            {
                declaration = adapter.CurrentScheme.Parse(parser, UserDocumentFileId);
            }
            return(declaration);
        }
Example #27
0
        /// <summary>
        /// Walks the table from the first data row to the end, creating sections, declarants and
        /// relatives and attaching each data row to the current person.
        /// </summary>
        /// <remarks>
        /// Empty rows and rows of column numbers are skipped. A header row found in the middle of
        /// the table replaces <paramref name="columnOrdering"/> for the rows that follow.
        /// </remarks>
        /// <param name="columnOrdering">Header description in effect at the start of the walk.</param>
        /// <param name="updateTrigrams">
        /// When true, every processed row is passed to <c>ColumnByDataPredictor.UpdateByRow</c>
        /// and <c>ColumnByDataPredictor.WriteData</c> is called after the walk.
        /// </param>
        public void FindBordersAndPersonNames(TableHeader columnOrdering, bool updateTrigrams)
        {
            int rowOffset = columnOrdering.FirstDataRow;

            // A section carried by the header opens at the first data row.
            if (columnOrdering.Section != null)
            {
                CreateNewSection(rowOffset, columnOrdering.Section);
            }

            // skipEmptyPerson: set when a row could not be attached to any person and cleared
            // when a new declarant is created; while set, rows are not attached.
            bool   skipEmptyPerson = false;
            string prevPersonName  = "";

            for (int row = rowOffset; row < Adapter.GetRowsCount(); row++)
            {
                DataRow currRow = Adapter.GetRow(columnOrdering, row);
                if (currRow == null || currRow.IsEmpty())
                {
                    continue;
                }
                if (IAdapter.IsNumbersRow(currRow.Cells))
                {
                    continue;
                }
                Logger.Debug(String.Format("currRow {0}, col_count={1}: {2}", row, currRow.Cells.Count, currRow.DebugString()));

                string sectionName;
                if (Adapter.IsSectionRow(row, currRow.Cells, columnOrdering.GetMaxColumnEndIndex(), false, out sectionName))
                {
                    CreateNewSection(row, sectionName);
                    continue;
                }
                {
                    // A repeated header switches the column ordering and moves the loop past it.
                    TableHeader newColumnOrdering;
                    if (IsHeaderRow(currRow, out newColumnOrdering))
                    {
                        columnOrdering = newColumnOrdering;
                        Logger.Debug(String.Format("found a new table header {0}", currRow.DebugString()));
                        row = newColumnOrdering.GetPossibleHeaderEnd() - 1; // row++ in "for" cycle
                        continue;
                    }
                }

                if (updateTrigrams)
                {
                    ColumnByDataPredictor.UpdateByRow(columnOrdering, currRow);
                }

                if (!currRow.InitPersonData(prevPersonName))
                {
                    // be robust, ignore errors see 8562.pdf.docx in tests
                    continue;
                }

                if (currRow.PersonName != String.Empty)
                {
                    // The row names a person: it starts a new declarant.
                    prevPersonName = currRow.PersonName;
                    CreateNewDeclarant(Adapter, currRow);
                    if (CurrentPerson != null)
                    {
                        skipEmptyPerson = false;
                    }
                }
                else if (currRow.RelativeType != String.Empty)
                {
                    // The row names a relation type: it starts a new relative.
                    if (!skipEmptyPerson)
                    {
                        try
                        {
                            CreateNewRelative(currRow);
                        }
                        catch (SmartParserRelativeWithoutPersonException e)
                        {
                            skipEmptyPerson = true;
                            Logger.Error(e.Message);
                            continue;
                        }
                    }
                }
                else
                {
                    // The row names nobody: it continues the current person, if there is one.
                    if (CurrentPerson == null && FailOnRelativeOrphan)
                    {
                        skipEmptyPerson = true;
                        Logger.Error(String.Format("No person to attach info on row={0}", row));
                        continue;
                    }
                }
                if (!skipEmptyPerson)
                {
                    AddInputRowToCurrentPerson(columnOrdering, currRow);
                    // When the declaration has no year yet, take it from the IncomeYear column.
                    if (_Declaration.Properties.Year == null && columnOrdering.ContainsField(DeclarationField.IncomeYear))
                    {
                        var incomeYear = currRow.GetDeclarationField(DeclarationField.IncomeYear);
                        if (incomeYear != null)
                        {
                            // NOTE(review): int.Parse throws on non-numeric cell text - confirm this is intended.
                            _Declaration.Properties.Year = int.Parse(incomeYear.Text);
                        }
                    }
                }
            }
            if (updateTrigrams)
            {
                ColumnByDataPredictor.WriteData();
            }

            Logger.Info("Parsed {0} declarants", _Declaration.PublicServants.Count());
        }
Example #28
0
 /// <summary>
 /// Creates a <c>DataRow</c> bound to this adapter for the given row index,
 /// using the supplied column ordering.
 /// </summary>
 /// <param name="columnOrdering">Table header passed through to the new row.</param>
 /// <param name="row">Index of the row to wrap.</param>
 /// <returns>A newly constructed <c>DataRow</c>.</returns>
 public DataRow GetRow(TableHeader columnOrdering, int row)
 {
     var dataRow = new DataRow(this, columnOrdering, row);
     return dataRow;
 }
Example #29
0
        /// <summary>
        /// Builds a declaration with a single public servant by walking every table
        /// in <c>Document</c> and dispatching it to a table-specific processor chosen
        /// from the normalized text of the table's first row.
        /// </summary>
        /// <remarks>
        /// A table whose first row matches no known header is handled by the same
        /// processor as the last recognized table, but only if that was a vehicle or
        /// real-estate table; otherwise it is ignored.
        /// </remarks>
        /// <param name="parser">Parser whose <c>Adapter</c> is used to initialize the declaration.</param>
        /// <param name="userDocumentFileId">Optional document file id forwarded to <c>Parser.InitializeDeclaration</c>.</param>
        /// <returns>The populated declaration.</returns>
        public override Declaration Parse(Parser parser, int?userDocumentFileId)
        {
            InitializeEP();

            TableHeader columnOrdering = new TableHeader();
            var         declaration    = Parser.InitializeDeclaration(parser.Adapter, columnOrdering, userDocumentFileId);

            declaration.Properties.Year       = GetYear();
            declaration.Properties.SheetTitle = FindTitleAboveTheTable();

            var currentDeclarant = CreatePublicServant(columnOrdering);

            declaration.PublicServants.Add(currentDeclarant);

            var tables = Document.Descendants <Table>().ToList();

            // Remembers which processor handled the last recognized table so that
            // a following table without a recognizable header can reuse it.
            string lastTableProcessor = "";

            foreach (var table in tables)
            {
                var rows = table.Descendants <TableRow>().ToList();
                if (rows.Count == 0)
                {
                    continue;
                }

                var cells   = rows[0].Descendants <TableCell>().ToList();
                var rowText = rows[0].InnerText.OnlyRussianLowercase();

                // A first row without any cells must not crash the parser:
                // treat its first-cell text as empty so it cannot match the position header.
                var firstCellText = cells.Count > 0
                    ? cells[0].InnerText.OnlyRussianLowercase()
                    : "";

                if (firstCellText == "замещаемаядолжность")
                {
                    ProcessPositionTable(table, currentDeclarant);
                    lastTableProcessor = "Position";
                }

                else if (rowText.Contains("ппвиддоходавеличинадоходаруб"))
                {
                    ProcessIncomeTable(table, currentDeclarant);
                    lastTableProcessor = "Income";
                }

                else if (rowText.Contains("ппвидимущества") &&
                         rowText.Contains("собственникимущества"))
                {
                    ParseRealEstateTable(table, currentDeclarant, TRealtyCellSpan.OwnedString);
                    lastTableProcessor = "RealEstateOwned";
                }

                else if (rowText.Contains("ппвидимущества") &&
                         rowText.Contains("находитсявпользовании"))
                {
                    ParseRealEstateTable(table, currentDeclarant, TRealtyCellSpan.StateString);
                    lastTableProcessor = "RealEstateState";
                }

                else if (rowText.Contains("видимаркатранспорт") &&
                         rowText.Contains("собственник"))
                {
                    ParseVehicleTable(table, currentDeclarant);
                    lastTableProcessor = "Vehicle";
                }
                else
                {
                    // Unrecognized header: reuse the previous processor where one applies.
                    switch (lastTableProcessor)
                    {
                    case "Vehicle": ParseVehicleTable(table, currentDeclarant); break;

                    case "RealEstateState": ParseRealEstateTable(table, currentDeclarant, TRealtyCellSpan.StateString); break;

                    case "RealEstateOwned": ParseRealEstateTable(table, currentDeclarant, TRealtyCellSpan.OwnedString); break;
                    }
                }
            }

            return(declaration);
        }
Example #30
0
        /// <summary>
        /// Resolves each header cell to a <c>DeclarationField</c> and registers the
        /// resulting column in <paramref name="columnOrdering"/>.
        /// </summary>
        /// <remarks>
        /// Cells that span every row of the adapter are skipped. Titles that are empty
        /// or at most one character long after absence markers are removed (other than
        /// "№") are predicted from column data. All other titles are looked up by text,
        /// falling back to prediction when the lookup fails and the cleaned title is at
        /// most four characters long. When <c>TableHeader.SearchForFioColumnOnly</c> is
        /// set, processing stops after the first name column has been added.
        /// </remarks>
        /// <param name="adapter">Table adapter used for row counts and data-based prediction.</param>
        /// <param name="cells">Header cells to classify.</param>
        /// <param name="columnOrdering">Header object that receives the detected columns.</param>
        /// <exception cref="SmartParserException">A non-trivial title could not be recognized.</exception>
        /// <exception cref="ColumnDetectorException">Prediction produced no field for a title that is not an empty value.</exception>
        static public void MapColumnTitlesToInnerConstants(IAdapter adapter, List <Cell> cells, TableHeader columnOrdering)
        {
            foreach (var headerCell in cells)
            {
                string title       = headerCell.GetText(true);
                string titleForLog = title.ReplaceEolnWithSpace().CoalesceWhitespace();
                Logger.Debug(string.Format("column title: \"{0}\"[{1}]", titleForLog, headerCell.CellWidth));

                // Remove every absence marker from the title, then trim the remainder.
                string strippedTitle = title;
                foreach (var marker in AbsenceMarkers)
                {
                    strippedTitle = strippedTitle.Replace(marker, "");
                }
                strippedTitle = strippedTitle.Trim();

                // A cell merged across all rows of the table is not treated as a column title.
                if (adapter.GetRowsCount() == headerCell.MergedRowsCount)
                {
                    continue;
                }

                bool titleIsTooShort = (title == "" || strippedTitle.Length <= 1) && (title != "№");

                DeclarationField field;
                if (!titleIsTooShort)
                {
                    field = HeaderHelpers.TryGetField(headerCell.TextAbove, title);
                    if ((field == DeclarationField.None) && strippedTitle.Length <= 4)
                    {
                        field = ColumnByDataPredictor.PredictEmptyColumnTitle(adapter, headerCell);
                        Logger.Debug("Predict: " + field.ToString());
                    }
                    if (field == DeclarationField.None)
                    {
                        throw new SmartParserException(String.Format("Cannot recognize field \"{0}\"", title.Replace('\n', ' ')));
                    }
                }
                else
                {
                    // too short title, try to predict by values
                    field = ColumnByDataPredictor.PredictEmptyColumnTitle(adapter, headerCell);
                    Logger.Debug("Predict: " + field.ToString());
                }

                if (field == DeclarationField.None && !DataHelper.IsEmptyValue(title))
                {
                    throw new ColumnDetectorException(String.Format("Fail to detect column type row: {0} title:{1}", headerCell.Row, title));
                }
                if (ColumnByDataPredictor.CalcPrecision)
                {
                    ColumnByDataPredictor.PredictForPrecisionCheck(adapter, headerCell, field);
                }

                AddColumn(columnOrdering, field, headerCell);

                if (TableHeader.SearchForFioColumnOnly && HeaderHelpers.IsNameDeclarationField(field))
                {
                    break;
                }
            }
        }