/// <summary>
/// Writes one randomly positioned slice of the table (at most 20 rows taken after the
/// possible header end) to <c>TolokaFileName</c> as a tab-separated record, preceded by
/// a line with the column names.
/// </summary>
/// <param name="adapter">Table source; provides the row count and the JSON conversion.</param>
/// <param name="columnOrdering">Recognized header; its possible header end bounds the slice.</param>
/// <param name="declaration">Parsed declaration; its sheet title is stored in the JSON.</param>
/// <param name="inputFileName">Input file name; stored in the JSON and used to build the record id.</param>
public static void SaveRandomPortionToToloka(IAdapter adapter, SmartParser.Lib.TableHeader columnOrdering, Declaration declaration, string inputFileName)
{
    // No output file configured. A null name is treated like "" so that it does not
    // reach the StreamWriter constructor, which rejects null paths.
    if (string.IsNullOrEmpty(TolokaFileName))
    {
        return;
    }

    string fileID = BuildInputFileId(adapter, inputFileName);
    using (System.IO.StreamWriter file = new System.IO.StreamWriter(TolokaFileName))
    {
        file.WriteLine("INPUT:input_id\tINPUT:input_json\tGOLDEN:declaration_json\tHINT:text");

        Random random = new Random();
        int dataRowsCount = Math.Min(20, adapter.GetRowsCount() - columnOrdering.GetPossibleHeaderEnd());
        // Random.Next treats the upper bound as exclusive; when both bounds are equal it returns the lower one.
        int dataStart = random.Next(columnOrdering.GetPossibleHeaderEnd(), adapter.GetRowsCount() - dataRowsCount);
        int dataEnd = dataStart + dataRowsCount;

        var json = adapter.TablePortionToJson(columnOrdering, dataStart, dataEnd);
        json.InputFileName = inputFileName;
        json.Title = declaration.Properties.SheetTitle;

        string jsonStr = JsonConvert.SerializeObject(json);
        // Tabs (real and JSON-escaped) would break the TSV columns; quotes are doubled
        // because the JSON field is wrapped in quotes below.
        jsonStr = jsonStr.Replace("\t", " ").Replace("\\t", " ").Replace("\"", "\"\"");

        string id = fileID + "_" + dataStart + "_" + dataEnd;
        // The two trailing columns (GOLDEN and HINT) are left empty.
        file.WriteLine(id + "\t" + "\"" + jsonStr + "\"\t\t");
    }
}
/// <summary>
/// Creates a fresh <c>TableHeader</c>, lets <c>ProcessTitle</c> fill it and report a row index,
/// passes that index through <c>Fix31832</c>, and then reads the header starting at the result.
/// </summary>
/// <param name="adapter">Table source to examine.</param>
/// <returns>The populated header description.</returns>
public static TableHeader ExamineTableBeginning(IAdapter adapter)
{
    var header = new TableHeader();
    int rowAfterTitle = ProcessTitle(adapter, header);
    int headerStartRow = Fix31832(adapter, rowAfterTitle);
    ReadHeader(adapter, headerStartRow, header);
    return header;
}
/// <summary>
/// Builds a <c>PublicServant</c> whose raw name comes from <c>GetPersonName()</c>,
/// with the given column ordering attached and index 1.
/// </summary>
/// <param name="columnOrdering">Header description stored on the new declarant.</param>
/// <returns>The newly created declarant.</returns>
private PublicServant CreatePublicServant(TableHeader columnOrdering)
{
    return new PublicServant
    {
        NameRaw = GetPersonName(),
        Ordering = columnOrdering,
        Index = 1,
    };
}
/// <summary>
/// Creates a data row bound to a table row index: stores the adapter and column ordering,
/// loads the row's cells up to the ordering's maximal column end index, and maps the cells
/// unless the adapter reports an Excel source.
/// </summary>
/// <param name="adapter">Table source the cells are read from.</param>
/// <param name="columnOrdering">Header description used for this row.</param>
/// <param name="row">Index of the row inside the adapter.</param>
public DataRow(IAdapter adapter, TableHeader columnOrdering, int row)
{
    this.adapter = adapter;
    this.row = row;
    ColumnOrdering = columnOrdering;
    Cells = adapter.GetDataCells(row, columnOrdering.GetMaxColumnEndIndex());

    if (adapter.IsExcel())
    {
        return;
    }
    MapCells();
}
/// <summary>
/// Attaches an input row to <c>CurrentPerson</c>. Does nothing when there is no current person
/// or when <c>DivideDeclarantAndRelativesBySoftEolns</c> reports that it handled the row.
/// </summary>
/// <param name="columnOrdering">Header description in effect for the row.</param>
/// <param name="row">Row to attach.</param>
public void AddInputRowToCurrentPerson(TableHeader columnOrdering, DataRow row)
{
    if (CurrentPerson == null)
    {
        return;
    }

    if (DivideDeclarantAndRelativesBySoftEolns(columnOrdering, row))
    {
        return;
    }

    CurrentPerson.DateRows.Add(row);
    TransposeTableByRelatives(columnOrdering, row);
}
/// <summary>
/// Re-labels the NameAndOccupationOrRelativeType column as NameOrRelativeType
/// when the header also contains a separate Occupation column.
/// </summary>
/// <param name="c">Header description to adjust in place.</param>
static void FixBadColumnName02(TableHeader c)
{
    bool hasCombinedColumn = c.ContainsField(DeclarationField.NameAndOccupationOrRelativeType);
    if (!hasCombinedColumn || !c.ContainsField(DeclarationField.Occupation))
    {
        return;
    }

    // Re-register the same column info under the new field, then drop the old key.
    TColumnInfo column = c.ColumnOrder[DeclarationField.NameAndOccupationOrRelativeType];
    column.Field = DeclarationField.NameOrRelativeType;
    c.Add(column);
    c.Delete(DeclarationField.NameAndOccupationOrRelativeType);
}
/// <summary>
/// Prints the text of one declaration field for every row from the first data row to the end
/// of the table, one line per row. Missing cells are printed as "null" and line breaks inside
/// the text are shown as a literal \n.
/// </summary>
/// <param name="adapter">Table source.</param>
/// <param name="columnOrdering">Header description; supplies the first data row.</param>
/// <param name="columnToDump">Field whose values are printed.</param>
static void DumpColumn(IAdapter adapter, SmartParser.Lib.TableHeader columnOrdering, DeclarationField columnToDump)
{
    for (int rowIndex = columnOrdering.FirstDataRow; rowIndex < adapter.GetRowsCount(); rowIndex++)
    {
        DataRow dataRow = adapter.GetRow(columnOrdering, rowIndex);
        var cell = dataRow.GetDeclarationField(columnToDump);

        string text;
        if (cell == null)
        {
            text = "null";
        }
        else
        {
            text = cell.GetText();
        }

        Console.WriteLine(text.Replace("\n", "\\n"));
    }
}
/// <summary>
/// Asks <c>FixMissingSubheadersForMergedColumns</c> to split the StateColumnWithNaturalText
/// column into type, square and country sub-columns.
/// </summary>
/// <param name="adapter">Table source.</param>
/// <param name="columnOrdering">Header description to adjust in place.</param>
static void FixMissingSubheadersForStateColumn(IAdapter adapter, TableHeader columnOrdering)
{
    // see niz_kam.docx in tests
    DeclarationField[] stateSubColumns =
    {
        DeclarationField.StatePropertyType,
        DeclarationField.StatePropertySquare,
        DeclarationField.StatePropertyCountry,
    };

    FixMissingSubheadersForMergedColumns(
        adapter,
        columnOrdering,
        DeclarationField.StateColumnWithNaturalText,
        stateSubColumns);
}
/// <summary>
/// Asks <c>FixMissingSubheadersForMergedColumns</c> to split the MixedColumnWithNaturalText
/// column into type, square and country sub-columns.
/// </summary>
/// <param name="adapter">Table source.</param>
/// <param name="columnOrdering">Header description to adjust in place.</param>
static void FixMissingSubheadersForMixedRealEstate(IAdapter adapter, TableHeader columnOrdering)
{
    // see DepEnergo2010.doc in tests
    DeclarationField[] mixedSubColumns =
    {
        DeclarationField.MixedRealEstateType,
        DeclarationField.MixedRealEstateSquare,
        DeclarationField.MixedRealEstateCountry,
    };

    FixMissingSubheadersForMergedColumns(
        adapter,
        columnOrdering,
        DeclarationField.MixedColumnWithNaturalText,
        mixedSubColumns);
}
/// <summary>
/// Asks <c>FixMissingSubheadersForMergedColumns</c> to split the OwnedColumnWithNaturalText
/// column into type, square, country and vehicle sub-columns.
/// </summary>
/// <param name="adapter">Table source.</param>
/// <param name="columnOrdering">Header description to adjust in place.</param>
static void FixMissingSubheadersForOwnedColumn(IAdapter adapter, TableHeader columnOrdering)
{
    // see niz_kam.docx in tests
    DeclarationField[] ownedSubColumns =
    {
        DeclarationField.OwnedRealEstateType,
        DeclarationField.OwnedRealEstateSquare,
        DeclarationField.OwnedRealEstateCountry,
        DeclarationField.Vehicle,
    };

    FixMissingSubheadersForMergedColumns(
        adapter,
        columnOrdering,
        DeclarationField.OwnedColumnWithNaturalText,
        ownedSubColumns);
}
/// <summary>
/// Re-labels a "natural text" column as the corresponding type column when the header
/// also contains both the country and the square columns of the same group.
/// </summary>
/// <param name="c">Header description to adjust in place.</param>
/// <param name="naturalText">Field currently assigned to the column.</param>
/// <param name="country">Country field that must be present.</param>
/// <param name="square">Square field that must be present.</param>
/// <param name="type">Field the column is re-labelled to.</param>
static void FixBadColumnName01_Template(TableHeader c, DeclarationField naturalText, DeclarationField country, DeclarationField square, DeclarationField type)
{
    bool applicable = c.ContainsField(naturalText)
                      && c.ContainsField(country)
                      && c.ContainsField(square);
    if (!applicable)
    {
        return;
    }

    // Re-register the same column info under the type field, then drop the old key.
    TColumnInfo column = c.ColumnOrder[naturalText];
    column.Field = type;
    c.Add(column);
    c.Delete(naturalText);
}
/// <summary>
/// Maps row cells to header columns by position: the i-th cell goes to the i-th merged column.
/// A cell whose pixel span does not intersect its column's span counts as a pixel error;
/// if such a cell has a non-empty value it must pass the semantic field test.
/// </summary>
/// <param name="columnOrdering">Header description; <c>MergedColumnOrder</c> supplies the columns.</param>
/// <param name="cells">Cells of one row, in order.</param>
/// <returns>
/// The field-to-cell map; an empty map when there are no cells and no columns;
/// <c>null</c> when the counts differ, when a non-intersecting non-empty cell fails the
/// semantic test, or when the pixel error counter is 3 or more after the last cell.
/// </returns>
static Dictionary<DeclarationField, Cell> MapByOrderAndIntersection(TableHeader columnOrdering, List<Cell> cells)
{
    if (columnOrdering.MergedColumnOrder.Count != cells.Count)
    {
        return null;
    }

    var res = new Dictionary<DeclarationField, Cell>();
    if (cells.Count == 0)
    {
        // Nothing to map; without this guard cells[0] below throws on an empty row.
        return res;
    }

    int start = cells[0].AdditTableIndention;
    int pixelErrorCount = 0;
    for (int i = 0; i < cells.Count; i++)
    {
        // [s1, e1) is the cell's span, [s2, e2) the span of the header column at the same position.
        int s1 = start;
        int e1 = start + cells[i].CellWidth;
        var colInfo = columnOrdering.MergedColumnOrder[i];
        int s2 = colInfo.ColumnPixelStart;
        int e2 = colInfo.ColumnPixelStart + colInfo.ColumnPixelWidth;

        if (TableHeader.PeriodIntersection(s1, e1, s2, e2) == 0)
        {
            pixelErrorCount += 1;
            if (!DataHelper.IsEmptyValue(cells[i].Text))
            {
                if (!ColumnByDataPredictor.TestFieldWithoutOwntypes(colInfo.Field, cells[i]))
                {
                    Logger.Debug(string.Format("cannot map column N={0} text={1}", i, cells[i].Text.Replace("\n", "\\n")));
                    return null;
                }
                else
                {
                    Logger.Debug(string.Format("found semantic argument for mapping N={0} text={1} to {2}", i, cells[i].Text.Replace("\n", "\\n"), colInfo.Field));
                    // A semantic confirmation resets the pixel error counter.
                    pixelErrorCount = 0;
                }
            }
        }

        res[colInfo.Field] = cells[i];
        start = e1;
    }

    if (pixelErrorCount >= 3)
    {
        return null;
    }
    return res;
}
/// <summary>
/// Feeds the text of every merged column of the row into <c>IncrementTrigrams</c>.
/// A missing cell contributes an empty string; any exception raised for a column is ignored
/// and processing continues with the next column.
/// </summary>
/// <param name="columnOrdering">Header description; <c>MergedColumnOrder</c> lists the columns.</param>
/// <param name="row">Row whose cell texts are counted.</param>
public static void UpdateByRow(TableHeader columnOrdering, DataRow row)
{
    // otherwise there is nowhere to write
    Debug.Assert(ColumnByDataPredictor.ExternalFileName != null);

    foreach (var column in columnOrdering.MergedColumnOrder)
    {
        try
        {
            var cell = row.GetDeclarationField(column.Field);
            string text;
            if (cell == null)
            {
                text = "";
            }
            else
            {
                text = cell.GetText();
            }
            IncrementTrigrams(column.Field, text);
        }
        catch (Exception)
        {
            // best effort: a column that cannot be read is skipped
        }
    }
}
/// <summary>
/// Applies <c>FixBadColumnName01_Template</c> to the mixed, state and owned column groups, in that order.
/// </summary>
/// <param name="c">Header description to adjust in place.</param>
static void FixBadColumnName01(TableHeader c)
{
    // Each entry: natural-text field, country field, square field, type field.
    DeclarationField[][] groups =
    {
        new[]
        {
            DeclarationField.MixedColumnWithNaturalText,
            DeclarationField.MixedRealEstateCountry,
            DeclarationField.MixedRealEstateSquare,
            DeclarationField.MixedRealEstateType,
        },
        new[]
        {
            DeclarationField.StateColumnWithNaturalText,
            DeclarationField.StatePropertyCountry,
            DeclarationField.StatePropertySquare,
            DeclarationField.StatePropertyType,
        },
        new[]
        {
            DeclarationField.OwnedColumnWithNaturalText,
            DeclarationField.OwnedRealEstateCountry,
            DeclarationField.OwnedRealEstateSquare,
            DeclarationField.OwnedRealEstateType,
        },
    };

    foreach (DeclarationField[] group in groups)
    {
        FixBadColumnName01_Template(c, group[0], group[1], group[2], group[3]);
    }
}
/// <summary>
/// Replaces a single Vehicle column by separate VehicleType and VehicleModel columns when the
/// header cell spans exactly two columns, the number of sub-cells found under it is not 1,
/// and its text (lower-cased, spaces removed) mentions vehicles, make and kind.
/// Each new column gets half of the header cell's pixel width.
/// </summary>
/// <param name="adapter">Table source.</param>
/// <param name="columnOrdering">Header description to adjust in place.</param>
static void FixMissingSubheadersForVehicle(IAdapter adapter, TableHeader columnOrdering)
{
    if (!columnOrdering.ContainsField(DeclarationField.Vehicle))
    {
        return;
    }

    TColumnInfo unusedColumnInfo;
    var headerCell = adapter.GetDeclarationFieldWeak(columnOrdering, columnOrdering.HeaderBegin.Value, DeclarationField.Vehicle, out unusedColumnInfo);
    if (headerCell.MergedColsCount != 2)
    {
        return;
    }

    var subCells = FindSubcellsUnder(adapter, headerCell);
    if (subCells.Count == 1)
    {
        return;
    }

    string compactHeader = headerCell.Text.ToLower().Replace(" ", "");
    bool describesTypeAndModel = compactHeader.Contains("транспортныесредства")
                                 && compactHeader.Contains("марка")
                                 && compactHeader.Contains("вид");
    if (!describesTypeAndModel)
    {
        return;
    }

    // Left half of the merged header cell: vehicle type.
    var typeColumn = new TColumnInfo
    {
        BeginColumn = headerCell.Col,
        EndColumn = headerCell.Col + 1,
        ColumnPixelWidth = headerCell.CellWidth / 2,
        Field = DeclarationField.VehicleType,
    };
    columnOrdering.Add(typeColumn);

    // Right half of the merged header cell: vehicle model.
    var modelColumn = new TColumnInfo
    {
        BeginColumn = headerCell.Col + 1,
        EndColumn = headerCell.Col + 2,
        ColumnPixelWidth = headerCell.CellWidth / 2,
        Field = DeclarationField.VehicleModel,
    };
    columnOrdering.Add(modelColumn);

    columnOrdering.Delete(DeclarationField.Vehicle);
}
/// <summary>
/// Checks whether a row is a table header: the row must pass the weak header check and
/// <c>TableHeaderRecognizer.ReadHeader</c> must succeed on it.
/// </summary>
/// <param name="row">Candidate row.</param>
/// <param name="columnOrdering">
/// Receives the header description read from the row when the method returns <c>true</c>.
/// It is <c>null</c> when the weak check fails; when reading the header throws, it holds the
/// partially filled instance and should not be used.
/// </param>
/// <returns><c>true</c> when the row was read as a header; otherwise <c>false</c>.</returns>
bool IsHeaderRow(DataRow row, out TableHeader columnOrdering)
{
    columnOrdering = null;
    if (!TableHeaderRecognizer.WeakHeaderCheck(Adapter, row.Cells))
    {
        return false;
    }

    try
    {
        columnOrdering = new TableHeader();
        TableHeaderRecognizer.ReadHeader(Adapter, row.GetRowIndex(), columnOrdering);
        return true;
    }
    catch (Exception e)
    {
        // Arguments follow the placeholder order: {0} is the row index, {1} the error.
        Logger.Debug(String.Format("Cannot parse possible header, row={0}, error={1}, so skip it may be it is a data row ", row.GetRowIndex(), e.ToString()));
    }
    return false;
}
/// <summary>
/// Builds a JSON-ready portion of the table: all possible header rows, the nearest section row
/// found scanning upwards from <paramref name="body_start"/> down to the header end, and up to
/// <c>body_end - body_start</c> non-empty data rows starting at <paramref name="body_start"/>.
/// </summary>
/// <param name="columnOrdering">Header description; supplies header bounds and the maximal column index.</param>
/// <param name="body_start">Index of the first data row to include.</param>
/// <param name="body_end">Used only to derive the maximal number of data rows.</param>
/// <returns>
/// The filled portion; <c>DataStart</c> is <paramref name="body_start"/> and <c>DataEnd</c> is the
/// index just past the last row examined (empty rows are skipped but still advance it).
/// </returns>
public TJsonTablePortion TablePortionToJson(TableHeader columnOrdering, int body_start, int body_end)
{
    var table = new TJsonTablePortion();
    table.DataStart = body_start;

    // The header end is loop-invariant, so it is read once and reused in both loops.
    int headerEnd = columnOrdering.GetPossibleHeaderEnd();
    for (int i = columnOrdering.GetPossibleHeaderBegin(); i < headerEnd; i++)
    {
        var row = GetJsonByRow(GetDataCells(i));
        table.Header.Add(row);
    }

    // find section before data
    // NOTE(review): the scan starts at body_start itself, not body_start - 1 — confirm this is intended.
    for (int i = body_start; i >= headerEnd; i--)
    {
        string dummy;
        // cannot use prevRowIsSection
        var row = GetDataCells(i);
        if (IsSectionRow(i, row, columnOrdering.GetMaxColumnEndIndex(), false, out dummy))
        {
            table.Section.Add(GetJsonByRow(row));
            break;
        }
    }

    int maxRowsCount = body_end - body_start;
    table.DataEnd = body_start;
    int addedRows = 0;
    while (table.DataEnd < GetRowsCount() && addedRows < maxRowsCount)
    {
        if (!IsEmptyRow(table.DataEnd))
        {
            table.Data.Add(GetJsonByRow(GetDataCells(table.DataEnd)));
            addedRows++;
        }
        table.DataEnd++;
    }
    return table;
}
/// <summary>
/// Registers a header cell as a column of the given field. For income columns the cell text is
/// additionally passed to <c>GetValuesFromTitle</c>, and a year found there is stored in
/// <c>ordering.YearFromIncome</c>.
/// </summary>
/// <param name="ordering">Header description receiving the column.</param>
/// <param name="field">Field assigned to the column.</param>
/// <param name="cell">Header cell providing the column span and pixel width.</param>
static void AddColumn(TableHeader ordering, DeclarationField field, Cell cell)
{
    // ColumnPixelStart is unknown here and is initialized in FinishOrderingBuilding
    var column = new TColumnInfo
    {
        BeginColumn = cell.Col,
        EndColumn = cell.Col + cell.MergedColsCount,
        ColumnPixelWidth = cell.CellWidth,
        Field = field,
    };

    if (IsIncomeColumn(field))
    {
        // The same throw-away string is deliberately passed for both ref string arguments.
        string ignored = "";
        int? year = null;
        bool parsed = TableHeaderRecognizer.GetValuesFromTitle(cell.GetText(), ref ignored, ref year, ref ignored);
        if (parsed && year.HasValue)
        {
            ordering.YearFromIncome = year.Value;
        }
    }

    ordering.Add(column);
}
/// <summary>
/// Runs the two parsing passes: initializes the declaration, finds person borders and names,
/// then parses the declarants. When <c>TableHeader.SearchForFioColumnOnly</c> is set, the method
/// returns right after the first pass. Logs the declarant rate and the total time.
/// </summary>
/// <param name="columnOrdering">Header description of the table.</param>
/// <param name="updateTrigrams">Forwarded to the border finder.</param>
/// <param name="documentfile_id">Optional document file id forwarded to <c>InitializeDeclaration</c>.</param>
/// <returns>The parsed declaration.</returns>
public Declaration Parse(TableHeader columnOrdering, bool updateTrigrams, int? documentfile_id)
{
    DateTime startedAt = DateTime.Now;
    Declaration declaration = InitializeDeclaration(Adapter, columnOrdering, documentfile_id);

    var borderFinder = new TBorderFinder(Adapter, declaration, FailOnRelativeOrphan);
    borderFinder.FindBordersAndPersonNames(columnOrdering, updateTrigrams);
    if (TableHeader.SearchForFioColumnOnly)
    {
        return declaration;
    }

    var secondPass = new TSecondPassParser(Adapter);
    int declarantCount = secondPass.ParseDeclarants(declaration);

    // Both measurements are taken from the same start time.
    double seconds = (DateTime.Now - startedAt).TotalSeconds;
    Logger.Info("Final Rate: {0:0.00} declarant in second", declarantCount / seconds);

    double total_seconds = (DateTime.Now - startedAt).TotalSeconds;
    Logger.Info("Total time: {0:0.00} seconds", total_seconds);

    return declaration;
}
/// <summary>
/// Creates a <c>Declaration</c> whose properties are taken from the header description, the
/// adapter and the document file name. A user-supplied document file id overrides the one
/// parsed from the file name. When the header carries a year found in an income column,
/// the declaration year becomes the larger of the two years, or the income year if no other
/// year is known.
/// </summary>
/// <param name="adapter">Table source; supplies file name, worksheet index and document URL.</param>
/// <param name="columnOrdering">Header description; supplies title and years.</param>
/// <param name="user_documentfile_id">Optional id that takes precedence over the parsed one.</param>
/// <returns>The new declaration with its properties set.</returns>
public static Declaration InitializeDeclaration(IAdapter adapter, TableHeader columnOrdering, int? user_documentfile_id)
{
    // parse filename (the boolean result of the call is not used)
    int? documentfile_id;
    string archive;
    DataHelper.ParseDocumentFileName(adapter.DocumentFile, out documentfile_id, out archive);
    if (user_documentfile_id.HasValue)
    {
        documentfile_id = user_documentfile_id;
    }

    var properties = new DeclarationProperties
    {
        SheetTitle = columnOrdering.Title,
        Year = columnOrdering.Year,
        DocumentFileId = documentfile_id,
        ArchiveFileName = archive,
        SheetNumber = adapter.GetWorksheetIndex(),
        DocumentUrl = adapter.GetDocumentUrlFromMetaTag(),
    };

    var yearFromIncome = columnOrdering.YearFromIncome;
    if (yearFromIncome != null)
    {
        properties.Year = properties.Year == null
            ? yearFromIncome
            : Math.Max(yearFromIncome.Value, properties.Year.Value);
    }

    return new Declaration { Properties = properties };
}
// Used directly only while ColumnOrdering has not been built yet;
// in all other cases Row.GetDeclarationField should be used.
/// <summary>
/// Returns the cell of <paramref name="row"/> located at the begin column of the given field.
/// </summary>
/// <param name="columnOrdering">Header description; its <c>ColumnOrder</c> is searched for the field.</param>
/// <param name="row">Row index.</param>
/// <param name="field">Field to look up.</param>
/// <param name="colSpan">Receives the column info registered for the field.</param>
/// <returns>The cell found at the field's begin column.</returns>
/// <exception cref="SmartParserFieldNotFoundException">
/// The field is not in the column order, or no cell exists at its begin column in this row.
/// </exception>
public virtual Cell GetDeclarationFieldWeak(TableHeader columnOrdering, int row, DeclarationField field, out TColumnInfo colSpan)
{
    bool fieldKnown = columnOrdering.ColumnOrder.TryGetValue(field, out colSpan);
    if (!fieldKnown)
    {
        throw new SmartParserFieldNotFoundException(String.Format("Field {0} not found, row={1}", field.ToString(), row));
    }

    var exactCell = GetCell(row, colSpan.BeginColumn);
    if (exactCell != null)
    {
        return exactCell;
    }

    // The row's cells are fetched only to report their count in the error message.
    var cells = GetDataCells(row);
    throw new SmartParserFieldNotFoundException(String.Format("Field {0} not found, row={1}, col={2}. Row.Cells.Count = {3}",
        field.ToString(),
        row,
        colSpan.BeginColumn,
        cells.Count));
}
/// <summary>
/// Maps row cells to header fields by pixel overlap. Walking the cells left to right, each cell
/// with a positive width is assigned to the field reported by <c>FindByPixelIntersection</c>;
/// when several cells hit the same field, the one with the larger intersection wins.
/// </summary>
/// <param name="columnOrdering">Header description used for the pixel lookup.</param>
/// <param name="cells">Cells of one row, in order.</param>
/// <returns>
/// The field-to-cell map (empty for an empty row), or <c>null</c> when a cell with
/// non-blank text cannot be matched to any field.
/// </returns>
static Dictionary<DeclarationField, Cell> MapByMaxIntersection(TableHeader columnOrdering, List<Cell> cells)
{
    Logger.Debug("MapByMaxIntersection");
    // map two header cells to one data cell
    // see dnko-2014.docx for an example
    var res = new Dictionary<DeclarationField, Cell>();
    var sizes = new Dictionary<DeclarationField, int>();
    if (cells.Count == 0)
    {
        return res;
    }

    int start = cells[0].AdditTableIndention;
    for (int index = 0; index < cells.Count; index++)
    {
        Cell current = cells[index];
        if (current.CellWidth > 0)
        {
            int interSize = 0;
            var field = columnOrdering.FindByPixelIntersection(start, start + current.CellWidth, out interSize);

            // cannot map some text, so it is a failure
            if (field == DeclarationField.None && current.Text.Trim().Length > 0)
            {
                return null;
            }

            // take only fields with maximal pixel intersection
            int bestSoFar;
            bool seenBefore = sizes.TryGetValue(field, out bestSoFar);
            if (!seenBefore || bestSoFar < interSize)
            {
                res[field] = current;
                sizes[field] = interSize;
            }
        }

        // The running offset advances for every cell, including those skipped above.
        start += current.CellWidth;
    }
    return res;
}
/// <summary>
/// Replaces a merged column by the given sub-columns when the number of cells found under its
/// header cell equals the number of sub-columns and <c>CheckSquareColumn</c> accepts the data.
/// </summary>
/// <param name="adapter">Table source.</param>
/// <param name="columnOrdering">Header description to adjust in place.</param>
/// <param name="mergedField">Field of the merged column to split.</param>
/// <param name="subColumns">Fields assigned, in order, to the cells found under the header cell.</param>
static void FixMissingSubheadersForMergedColumns(IAdapter adapter, TableHeader columnOrdering, DeclarationField mergedField, DeclarationField[] subColumns)
{
    if (!columnOrdering.ContainsField(mergedField))
    {
        return;
    }

    TColumnInfo unusedColumnInfo;
    var headerCell = adapter.GetDeclarationFieldWeak(columnOrdering, columnOrdering.HeaderBegin.Value, mergedField, out unusedColumnInfo);
    var subCells = FindSubcellsUnder(adapter, headerCell);

    int expectedCount = subColumns.Count();
    if (subCells.Count != expectedCount)
    {
        return;
    }

    // we check only the second column, todo check the first one and the third
    if (!CheckSquareColumn(adapter, columnOrdering.FirstDataRow, 5, subCells, 1))
    {
        return;
    }

    for (int i = 0; i < expectedCount; ++i)
    {
        AddColumn(columnOrdering, subColumns[i], subCells[i]);
    }
    columnOrdering.Delete(mergedField);
}
/// <summary>
/// Reads the table header starting at <paramref name="headerStartRow"/>: maps the header cells
/// to fields, records the header bounds and the first data row, applies the column fix-ups and
/// finishes building the ordering.
/// </summary>
/// <param name="adapter">Table source.</param>
/// <param name="headerStartRow">Index of the first header row.</param>
/// <param name="columnOrdering">Header description to fill.</param>
/// <exception cref="SmartParserException">No column was recognized.</exception>
public static void ReadHeader(IAdapter adapter, int headerStartRow, TableHeader columnOrdering)
{
    int headerEndRow;
    var cells = GetColumnCells(adapter, headerStartRow, out headerEndRow);
    MapColumnTitlesToInnerConstants(adapter, cells, columnOrdering);
    columnOrdering.HeaderBegin = headerStartRow;
    columnOrdering.HeaderEnd = headerEndRow;

    int firstDataRow = columnOrdering.HeaderEnd.Value;
    // Skip the row that only numbers the columns ("1", "2", ...).
    if (firstDataRow < adapter.GetRowsCount())
    {
        // GetCell may return null (e.g. when the row has fewer cells); such a row is not a numbering row.
        string cellText1 = adapter.GetCell(firstDataRow, 0)?.GetText();
        string cellText2 = adapter.GetCell(firstDataRow, 1)?.GetText();
        if (cellText1 != null && cellText2 != null
            && cellText1.StartsWith("1") && cellText2.StartsWith("2"))
        {
            firstDataRow++;
        }
    }
    columnOrdering.FirstDataRow = firstDataRow;

    if (columnOrdering.ColumnOrder.Count() == 0)
    {
        throw new SmartParserException("cannot find headers");
    }

    // The fix-ups run in this fixed order.
    FixMissingSubheadersForMixedRealEstate(adapter, columnOrdering);
    FixMissingSubheadersForVehicle(adapter, columnOrdering);
    FixBadColumnName01(columnOrdering);
    FixBadColumnName02(columnOrdering);
    FixMissingSubheadersForOwnedColumn(adapter, columnOrdering);
    FixMissingSubheadersForStateColumn(adapter, columnOrdering);

    columnOrdering.FinishOrderingBuilding(cells[0].AdditTableIndention);
}
// see 8562.pdf.docx in tests
// (the original note also mentions computing string width with graphics.MeasureString;
//  no such call is visible in this method — presumably it happens in a helper, confirm)
/// <summary>
/// Tries to split a row whose name cell holds the declarant and relatives on separate soft-broken
/// lines into one row per person. Lines that parse as a relation type start a new part.
/// </summary>
/// <param name="columnOrdering">Header description; must contain NameOrRelativeType.</param>
/// <param name="row">Row to split.</param>
/// <returns>
/// <c>true</c> when the row was divided and every part was converted by
/// <c>DividedLinesToDataRows</c>; <c>false</c> when the row is not eligible or a conversion fails.
/// </returns>
bool DivideDeclarantAndRelativesBySoftEolns(TableHeader columnOrdering, DataRow row)
{
    if (CurrentDeclarant.Relatives.Count() > 0)
    {
        return false;
    }
    if (!columnOrdering.ContainsField(DeclarationField.NameOrRelativeType))
    {
        return false;
    }

    Cell nameCell = row.GetDeclarationField(DeclarationField.NameOrRelativeType);
    // The null check comes first: placed after the type test it could never be reached,
    // because a null reference fails both "is" tests.
    if (nameCell is null)
    {
        return false;
    }
    if (!(nameCell is OpenXmlWordCell) && !(nameCell is HtmlAdapterCell))
    {
        return false;
    }
    if (nameCell.IsEmpty)
    {
        return false;
    }
    if (row.adapter.IsExcel())
    {
        return false; // no font info
    }

    List<string> lines = GetLinesWithSoftBreaks(nameCell);
    if (lines.Count < 2)
    {
        return false;
    }

    // Line 0 always opens the first part; every later line that parses as a relation type opens a new one.
    List<int> borders = new List<int>() { 0 };
    for (int i = 1; i < lines.Count; ++i)
    {
        if (DataHelper.ParseRelationType(lines[i], false) != RelationType.Error)
        {
            borders.Add(i);
        }
    }
    if (borders.Count == 1)
    {
        return false;
    }

    List<DataRow> dividedLines = new List<DataRow>();
    for (int i = 0; i < borders.Count; ++i)
    {
        dividedLines.Add(row.DeepClone());
    }
    for (int i = 0; i < row.Cells.Count; ++i)
    {
        DivideCell(row, i, borders, dividedLines);
    }
    for (int k = 0; k < borders.Count; ++k)
    {
        if (!DividedLinesToDataRows(row, dividedLines, k))
        {
            return false;
        }
    }

    Logger.Debug(String.Format("Divide line to {0} parts", borders.Count));
    return true;
}
/// <summary>
/// Parses one input into a <c>Declaration</c>. When the adapter has no predefined scheme, the
/// table header is recognized (falling back to the last successfully recognized header),
/// validated for name and income columns, and the table is parsed; otherwise the adapter's
/// scheme does the parsing.
/// </summary>
/// <param name="adapter">Table source.</param>
/// <param name="inputFile">Path of the input file; its file name is used for the year fallback.</param>
/// <returns>
/// The parsed declaration, or <c>null</c> when only the columns were requested
/// (<c>ColumnsOnly</c>) or a single column was dumped (<c>ColumnToDump</c>).
/// </returns>
/// <exception cref="SmartParserException">No declarant name column, or no income column, was found.</exception>
static Declaration BuildDeclarations(IAdapter adapter, string inputFile)
{
    Declaration declaration;
    string inputFileName = Path.GetFileName(inputFile);
    SmartParser.Lib.Parser parser = new SmartParser.Lib.Parser(adapter, !SkipRelativeOrphan);

    if (adapter.CurrentScheme == default)
    {
        SmartParser.Lib.TableHeader? columnOrdering = null;
        try
        {
            columnOrdering = TableHeaderRecognizer.ExamineTableBeginning(adapter);
            LastGoodOrdering = columnOrdering;
        }
        catch (Exception ex)
        {
            Logger.Info(ex.Message);
            if (LastGoodOrdering != null)
            {
                Logger.Info("use the last known table header scheme");
                // Note: this reuses (and modifies) the shared LastGoodOrdering instance.
                columnOrdering = LastGoodOrdering;
                columnOrdering.FirstDataRow = 0;
            }
            else
            {
                // Rethrow without resetting the stack trace.
                throw;
            }
        }

        // Try to extract declaration year from file name if we weren't able to get it from document title
        if (!columnOrdering.Year.HasValue)
        {
            columnOrdering.Year = TextHelpers.ExtractYear(inputFileName);
        }

        Logger.Info("Column ordering: ");
        foreach (var ordering in columnOrdering.ColumnOrder)
        {
            Logger.Info(ordering.ToString());
        }
        Logger.Info(String.Format("OwnershipTypeInSeparateField: {0}", columnOrdering.OwnershipTypeInSeparateField));

        if (ColumnsOnly)
        {
            return null;
        }
        if (ColumnToDump != DeclarationField.None)
        {
            DumpColumn(adapter, columnOrdering, ColumnToDump);
            return null;
        }

        if (columnOrdering.Title != null)
        {
            Logger.Info("Declaration Title: {0} ", columnOrdering.Title);
        }
        if (columnOrdering.Year != null)
        {
            Logger.Info("Declaration Year: {0} ", columnOrdering.Year.Value);
        }
        if (columnOrdering.MinistryName != null)
        {
            Logger.Info("Declaration Ministry: {0} ", columnOrdering.MinistryName);
        }

        if (!columnOrdering.HasNameColumn())
        {
            // TODO first look for the first section_row and check it: the person's full name may be exactly there
            // https://declarator.org/admin/declarations/jsonfile/186842/change/
            throw new SmartParserException("Insufficient fields: No any of Declarant Name fields found.");
        }

        bool hasIncomeColumn =
            columnOrdering.ContainsField(DeclarationField.DeclarantIncome) ||
            columnOrdering.ContainsField(DeclarationField.DeclarantIncomeInThousands) ||
            columnOrdering.ContainsField(DeclarationField.DeclaredYearlyIncome) ||
            columnOrdering.ContainsField(DeclarationField.DeclaredYearlyIncomeThousands);
        if (!hasIncomeColumn)
        {
            // A missing income column is tolerated only in the "name column only" mode.
            if (!SmartParser.Lib.TableHeader.SearchForFioColumnOnly)
            {
                throw new SmartParserException("Insufficient fields: No any of Declarant Income fields found.");
            }
        }

        declaration = parser.Parse(columnOrdering, BuildTrigrams, UserDocumentFileId);
        SaveRandomPortionToToloka(adapter, columnOrdering, declaration, inputFile);
    }
    else
    {
        declaration = adapter.CurrentScheme.Parse(parser, UserDocumentFileId);
    }
    return declaration;
}
/// <summary>
/// First pass over the data rows: walks the table from the first data row, opening sections,
/// switching to a new header when a header row is met, creating declarants and relatives, and
/// attaching each remaining row to the current person. If the declaration year is still unknown
/// and the header has an IncomeYear column, the year is parsed from that cell.
/// </summary>
/// <param name="columnOrdering">Initial header description; replaced when a new header row is found.</param>
/// <param name="updateTrigrams">When set, every processed row is passed to the column predictor and its data is written at the end.</param>
public void FindBordersAndPersonNames(TableHeader columnOrdering, bool updateTrigrams)
{
    int firstRow = columnOrdering.FirstDataRow;
    if (columnOrdering.Section != null)
    {
        CreateNewSection(firstRow, columnOrdering.Section);
    }

    // While set, rows are not attached to anybody (the preceding person could not be created).
    bool skipEmptyPerson = false;
    string prevPersonName = "";

    for (int row = firstRow; row < Adapter.GetRowsCount(); row++)
    {
        DataRow current = Adapter.GetRow(columnOrdering, row);
        if (current == null || current.IsEmpty())
        {
            continue;
        }
        if (IAdapter.IsNumbersRow(current.Cells))
        {
            continue;
        }

        Logger.Debug(String.Format("currRow {0}, col_count={1}: {2}", row, current.Cells.Count, current.DebugString()));

        string sectionName;
        if (Adapter.IsSectionRow(row, current.Cells, columnOrdering.GetMaxColumnEndIndex(), false, out sectionName))
        {
            CreateNewSection(row, sectionName);
            continue;
        }

        TableHeader nextHeader;
        if (IsHeaderRow(current, out nextHeader))
        {
            columnOrdering = nextHeader;
            Logger.Debug(String.Format("found a new table header {0}", current.DebugString()));
            // minus one because the "for" statement increments row
            row = nextHeader.GetPossibleHeaderEnd() - 1;
            continue;
        }

        if (updateTrigrams)
        {
            ColumnByDataPredictor.UpdateByRow(columnOrdering, current);
        }

        if (!current.InitPersonData(prevPersonName))
        {
            // be robust, ignore errors see 8562.pdf.docx in tests
            continue;
        }

        if (current.PersonName != String.Empty)
        {
            prevPersonName = current.PersonName;
            CreateNewDeclarant(Adapter, current);
            if (CurrentPerson != null)
            {
                skipEmptyPerson = false;
            }
        }
        else if (current.RelativeType != String.Empty)
        {
            if (!skipEmptyPerson)
            {
                try
                {
                    CreateNewRelative(current);
                }
                catch (SmartParserRelativeWithoutPersonException e)
                {
                    skipEmptyPerson = true;
                    Logger.Error(e.Message);
                    continue;
                }
            }
        }
        else if (CurrentPerson == null && FailOnRelativeOrphan)
        {
            skipEmptyPerson = true;
            Logger.Error(String.Format("No person to attach info on row={0}", row));
            continue;
        }

        if (skipEmptyPerson)
        {
            continue;
        }

        AddInputRowToCurrentPerson(columnOrdering, current);

        bool yearStillUnknown = _Declaration.Properties.Year == null;
        if (yearStillUnknown && columnOrdering.ContainsField(DeclarationField.IncomeYear))
        {
            var incomeYear = current.GetDeclarationField(DeclarationField.IncomeYear);
            if (incomeYear != null)
            {
                _Declaration.Properties.Year = int.Parse(incomeYear.Text);
            }
        }
    }

    if (updateTrigrams)
    {
        ColumnByDataPredictor.WriteData();
    }

    Logger.Info("Parsed {0} declarants", _Declaration.PublicServants.Count());
}
/// <summary>Creates a <c>DataRow</c> for the given row index, bound to this adapter and the given column ordering.</summary>
/// <param name="columnOrdering">Header description passed to the new row.</param>
/// <param name="row">Index of the row.</param>
/// <returns>A new <c>DataRow</c> instance.</returns>
public DataRow GetRow(TableHeader columnOrdering, int row) =>
    new DataRow(this, columnOrdering, row);
/// <summary>
/// Parses a document that consists of several tables describing one declarant. Each table is
/// dispatched by the normalized text of its first row (position, income, owned real estate,
/// real estate in use, vehicles); a table with an unrecognized first row is treated as a
/// continuation of the last vehicle or real-estate table.
/// </summary>
/// <param name="parser">Parser whose adapter is used to initialize the declaration.</param>
/// <param name="userDocumentFileId">Optional document file id forwarded to <c>InitializeDeclaration</c>.</param>
/// <returns>The declaration with a single public servant filled from the tables.</returns>
public override Declaration Parse(Parser parser, int? userDocumentFileId)
{
    InitializeEP();
    TableHeader columnOrdering = new TableHeader();
    var declaration = Parser.InitializeDeclaration(parser.Adapter, columnOrdering, userDocumentFileId);
    declaration.Properties.Year = GetYear();
    declaration.Properties.SheetTitle = FindTitleAboveTheTable();

    var currentDeclarant = CreatePublicServant(columnOrdering);
    declaration.PublicServants.Add(currentDeclarant);

    var tables = Document.Descendants<Table>().ToList();
    // Remembers which kind of table was handled last, for continuation tables.
    string lastTableProcessor = "";
    foreach (var table in tables)
    {
        var rows = table.Descendants<TableRow>().ToList();
        if (rows.Count == 0)
        {
            continue;
        }

        var cells = rows[0].Descendants<TableCell>().ToList();
        var rowText = rows[0].InnerText.OnlyRussianLowercase();
        // NOTE(review): cells[0] throws when the first row has no cells — confirm such rows cannot occur.
        var firstCellText = cells[0].InnerText.OnlyRussianLowercase();

        if (firstCellText == "замещаемаядолжность")
        {
            ProcessPositionTable(table, currentDeclarant);
            lastTableProcessor = "Position";
        }
        else if (rowText.Contains("ппвиддоходавеличинадоходаруб"))
        {
            ProcessIncomeTable(table, currentDeclarant);
            lastTableProcessor = "Income";
        }
        else if (rowText.Contains("ппвидимущества") && rowText.Contains("собственникимущества"))
        {
            ParseRealEstateTable(table, currentDeclarant, TRealtyCellSpan.OwnedString);
            lastTableProcessor = "RealEstateOwned";
        }
        else if (rowText.Contains("ппвидимущества") && rowText.Contains("находитсявпользовании"))
        {
            ParseRealEstateTable(table, currentDeclarant, TRealtyCellSpan.StateString);
            lastTableProcessor = "RealEstateState";
        }
        else if (rowText.Contains("видимаркатранспорт") && rowText.Contains("собственник"))
        {
            ParseVehicleTable(table, currentDeclarant);
            lastTableProcessor = "Vehicle";
        }
        else
        {
            // Unrecognized first row: continue with the previous table kind, if it supports continuation.
            switch (lastTableProcessor)
            {
                case "Vehicle":
                    ParseVehicleTable(table, currentDeclarant);
                    break;
                case "RealEstateState":
                    ParseRealEstateTable(table, currentDeclarant, TRealtyCellSpan.StateString);
                    break;
                case "RealEstateOwned":
                    ParseRealEstateTable(table, currentDeclarant, TRealtyCellSpan.OwnedString);
                    break;
            }
        }
    }
    return declaration;
}
/// <summary>
/// Assigns a declaration field to every header cell and registers it via <c>AddColumn</c>.
/// Titles that are empty or at most one character long after removing absence markers
/// (except "№") are predicted from the column data; other titles are resolved by
/// <c>HeaderHelpers.TryGetField</c>, with a data-based prediction as fallback for titles
/// of at most four characters. Cells spanning all table rows are skipped. In the
/// "name column only" mode the loop stops after the first name field.
/// </summary>
/// <param name="adapter">Table source.</param>
/// <param name="cells">Header cells to classify.</param>
/// <param name="columnOrdering">Header description receiving the columns.</param>
/// <exception cref="SmartParserException">A non-trivial title could not be recognized.</exception>
/// <exception cref="ColumnDetectorException">No field was found for a cell whose text is not an empty value.</exception>
public static void MapColumnTitlesToInnerConstants(IAdapter adapter, List<Cell> cells, TableHeader columnOrdering)
{
    foreach (var cell in cells)
    {
        string text = cell.GetText(true);
        Logger.Debug(string.Format("column title: \"{0}\"[{1}]", text.ReplaceEolnWithSpace().CoalesceWhitespace(), cell.CellWidth));

        string titleWithoutMarkers = AbsenceMarkers
            .Aggregate(text, (current, marker) => current.Replace(marker, ""))
            .Trim();

        if (adapter.GetRowsCount() == cell.MergedRowsCount)
        {
            continue;
        }

        DeclarationField field;
        bool titleTooShort = (text == "" || titleWithoutMarkers.Length <= 1) && (text != "№");
        if (titleTooShort)
        {
            // too short title, try to predict by values
            field = ColumnByDataPredictor.PredictEmptyColumnTitle(adapter, cell);
            Logger.Debug("Predict: " + field.ToString());
        }
        else
        {
            field = HeaderHelpers.TryGetField(cell.TextAbove, text);
            if ((field == DeclarationField.None) && titleWithoutMarkers.Length <= 4)
            {
                field = ColumnByDataPredictor.PredictEmptyColumnTitle(adapter, cell);
                Logger.Debug("Predict: " + field.ToString());
            }
            if (field == DeclarationField.None)
            {
                throw new SmartParserException(String.Format("Cannot recognize field \"{0}\"", text.Replace('\n', ' ')));
            }
        }

        if (field == DeclarationField.None && !DataHelper.IsEmptyValue(text))
        {
            throw new ColumnDetectorException(String.Format("Fail to detect column type row: {0} title:{1}", cell.Row, text));
        }

        if (ColumnByDataPredictor.CalcPrecision)
        {
            ColumnByDataPredictor.PredictForPrecisionCheck(adapter, cell, field);
        }

        AddColumn(columnOrdering, field, cell);

        if (TableHeader.SearchForFioColumnOnly && HeaderHelpers.IsNameDeclarationField(field))
        {
            break;
        }
    }
}