/// <summary>
/// Determines a <c>DeclarationField</c> for every header cell in <paramref name="cells"/>
/// and registers it in <paramref name="columnOrdering"/> through <c>AddColumn</c>.
/// </summary>
/// <param name="adapter">Table adapter; used for the row count and handed to the column predictor.</param>
/// <param name="cells">Header cells to classify, processed in list order.</param>
/// <param name="columnOrdering">Ordering that receives the recognized columns.</param>
/// <exception cref="SmartParserException">A sufficiently long title could be neither recognized nor predicted.</exception>
/// <exception cref="ColumnDetectorException">The field stayed <c>None</c> although the title is not an empty value.</exception>
static public void MapStringsToConstants(IAdapter adapter, List <Cell> cells, ColumnOrdering columnOrdering)
{
    foreach (Cell headerCell in cells)
    {
        string title = headerCell.GetText(true);
        Logger.Debug(string.Format("column title: \"{0}\"[{1}]", title.ReplaceEolnWithSpace().CoalesceWhitespace(), headerCell.CellWidth));

        // Title with every absence marker removed; its length decides whether the title is trusted.
        string strippedTitle = title;
        foreach (var marker in AbsenceMarkers)
        {
            strippedTitle = strippedTitle.Replace(marker, "");
        }
        strippedTitle = strippedTitle.Trim();

        // A cell merged over as many rows as the adapter has is not treated as a column header.
        if (adapter.GetRowsCount() == headerCell.MergedRowsCount)
        {
            continue;
        }

        // "№" is accepted as a title even though it is a single character.
        bool titleIsUsable = (title != "" && strippedTitle.Length > 1) || title == "№";

        DeclarationField field;
        if (titleIsUsable)
        {
            if (headerCell.TextAbove != null)
            {
                title = headerCell.TextAbove + " " + title;
            }

            string singleLineTitle = title.Replace('\n', ' ');
            field = HeaderHelpers.TryGetField(singleLineTitle);

            // Short unrecognized titles get a second chance from the value-based predictor.
            if (field == DeclarationField.None && strippedTitle.Length <= 4)
            {
                field = ColumnPredictor.PredictEmptyColumnTitle(adapter, headerCell);
                Logger.Debug("Predict: " + field.ToString());
            }

            if (field == DeclarationField.None)
            {
                throw new SmartParserException(String.Format("Cannot recognize field \"{0}\"", singleLineTitle));
            }
        }
        else
        {
            // too short title, try to predict by values
            field = ColumnPredictor.PredictEmptyColumnTitle(adapter, headerCell);
            Logger.Debug("Predict: " + field.ToString());
        }

        if (field == DeclarationField.None && !DataHelper.IsEmptyValue(title))
        {
            throw new ColumnDetectorException(String.Format("Fail to detect column type row: {0} title:{1}", headerCell.Row, title));
        }

        if (ColumnPredictor.CalcPrecision)
        {
            ColumnPredictor.PredictForPrecisionCheck(adapter, headerCell, field);
        }

        AddColumn(columnOrdering, field, headerCell);

        // In "FIO only" mode the scan stops as soon as a name column has been added.
        if (ColumnOrdering.SearchForFioColumnOnly
            && (field == DeclarationField.NameAndOccupationOrRelativeType || field == DeclarationField.NameOrRelativeType))
        {
            break;
        }
    }
}
// Detects the columns of "customs-tworow-header.xls" and checks that 14 columns are found
// and that the Occupation column begins at column index 2.
public void TwoRowHeaderEmptyTopCellTest2()
{
    string xlsFile = Path.Combine(TestUtil.GetTestDataPath(), "customs-tworow-header.xls");
    IAdapter adapter = AsposeExcelAdapter.CreateAdapter(xlsFile);
    ColumnPredictor.InitializeIfNotAlready();

    ColumnOrdering ordering = ColumnDetector.ExamineTableBeginning(adapter);

    // Assert.AreEqual takes (expected, actual); with the arguments swapped a failure
    // message would report the expectation as the actual value.
    Assert.AreEqual(14, ordering.ColumnOrder.Count);
    Assert.AreEqual(2, ordering.ColumnOrder[DeclarationField.Occupation].BeginColumn);
}
// Detects the columns of "17497.xls" and checks that the vehicle data is reported as
// separate VehicleType / VehicleModel columns, with no combined Vehicle column.
public void FixVehicleColumns()
{
    string testDataDir = TestUtil.GetTestDataPath();
    string workbookPath = Path.Combine(testDataDir, "17497.xls");
    IAdapter tableAdapter = AsposeExcelAdapter.CreateAdapter(workbookPath, -1);
    ColumnPredictor.InitializeIfNotAlready();

    ColumnOrdering detected = ColumnDetector.ExamineTableBeginning(tableAdapter);

    Assert.AreEqual(15, detected.ColumnOrder.Count);
    Assert.IsTrue(detected.ContainsField(DeclarationField.VehicleType));
    Assert.IsTrue(detected.ContainsField(DeclarationField.VehicleModel));
    Assert.IsFalse(detected.ContainsField(DeclarationField.Vehicle));
}
/// <summary>
/// Pairs the i-th cell with the i-th entry of <c>columnOrdering.MergedColumnOrder</c> and
/// checks each pair by pixel overlap, falling back to a content check when there is no overlap.
/// </summary>
/// <param name="columnOrdering">Ordering whose merged columns supply the expected field and pixel extent.</param>
/// <param name="cells">Row cells; their widths are accumulated from the first cell's <c>AdditTableIndention</c>.</param>
/// <returns>
/// A field-to-cell map, or <c>null</c> when the counts differ, when a non-empty cell without
/// overlap fails <c>ColumnPredictor.TestFieldWithoutOwntypes</c>, or when the miss counter ends at 3 or more.
/// </returns>
static Dictionary <DeclarationField, Cell> MapByOrderAndIntersection(ColumnOrdering columnOrdering, List <Cell> cells)
{
    if (columnOrdering.MergedColumnOrder.Count != cells.Count)
    {
        return null;
    }

    int cellLeft = cells[0].AdditTableIndention;
    var mapping = new Dictionary <DeclarationField, Cell>();
    // Number of pairs without pixel overlap since the last content-confirmed pair.
    int pixelMisses = 0;

    for (int index = 0; index < cells.Count; ++index)
    {
        Cell cell = cells[index];
        int cellRight = cellLeft + cell.CellWidth;

        var column = columnOrdering.MergedColumnOrder[index];
        int columnLeft = column.ColumnPixelStart;
        int columnRight = column.ColumnPixelStart + column.ColumnPixelWidth;

        bool overlaps = ColumnOrdering.PeriodIntersection(cellLeft, cellRight, columnLeft, columnRight) != 0;
        if (!overlaps)
        {
            pixelMisses++;

            if (!DataHelper.IsEmptyValue(cell.Text))
            {
                if (ColumnPredictor.TestFieldWithoutOwntypes(column.Field, cell))
                {
                    // The cell content fits the field, so earlier misses are forgiven.
                    Logger.Debug(string.Format("found semantic argument for mapping N={0} text={1} to {2}", index, cell.Text.Replace("\n", "\\n"), column.Field));
                    pixelMisses = 0;
                }
                else
                {
                    Logger.Debug(string.Format("cannot map column N={0} text={1}", index, cell.Text.Replace("\n", "\\n")));
                    return null;
                }
            }
        }

        mapping[column.Field] = cell;
        cellLeft = cellRight;
    }

    return pixelMisses >= 3 ? null : mapping;
}
// Detects the columns of "rabotniki_podved_organizacii_2013.xlsx" and checks the begin
// column of each of the 13 expected fields (column indexes 0 through 12).
public void EmptyRealStateTypeColumnDetectorTest1()
{
    string xlsxFile = Path.Combine(TestUtil.GetTestDataPath(), "rabotniki_podved_organizacii_2013.xlsx");
    IAdapter adapter = AsposeExcelAdapter.CreateAdapter(xlsxFile);
    ColumnPredictor.InitializeIfNotAlready();

    ColumnOrdering ordering = ColumnDetector.ExamineTableBeginning(adapter);

    // Assert.AreEqual(expected, actual) reports the detected column index on failure,
    // which Assert.IsTrue(a == b) cannot do.
    Assert.AreEqual(0, ordering.ColumnOrder[DeclarationField.Number].BeginColumn);
    Assert.AreEqual(1, ordering.ColumnOrder[DeclarationField.NameOrRelativeType].BeginColumn);
    Assert.AreEqual(2, ordering.ColumnOrder[DeclarationField.Occupation].BeginColumn);
    Assert.AreEqual(3, ordering.ColumnOrder[DeclarationField.OwnedRealEstateType].BeginColumn);
    Assert.AreEqual(4, ordering.ColumnOrder[DeclarationField.OwnedRealEstateOwnershipType].BeginColumn);
    Assert.AreEqual(5, ordering.ColumnOrder[DeclarationField.OwnedRealEstateSquare].BeginColumn);
    Assert.AreEqual(6, ordering.ColumnOrder[DeclarationField.OwnedRealEstateCountry].BeginColumn);
    Assert.AreEqual(7, ordering.ColumnOrder[DeclarationField.StatePropertyType].BeginColumn);
    Assert.AreEqual(8, ordering.ColumnOrder[DeclarationField.StatePropertySquare].BeginColumn);
    Assert.AreEqual(9, ordering.ColumnOrder[DeclarationField.StatePropertyCountry].BeginColumn);
    Assert.AreEqual(10, ordering.ColumnOrder[DeclarationField.Vehicle].BeginColumn);
    Assert.AreEqual(11, ordering.ColumnOrder[DeclarationField.DeclaredYearlyIncome].BeginColumn);
    Assert.AreEqual(12, ordering.ColumnOrder[DeclarationField.DataSources].BeginColumn);
}
/// <summary>
/// Builds a <c>Declaration</c> from the rows of <c>Adapter</c>, from
/// <c>columnOrdering.FirstDataRow</c> up to <c>Adapter.GetRowsCount()</c>.
/// </summary>
/// <remarks>
/// Records <c>FirstPassStartTime</c>, creates the declaration with <c>InitializeDeclaration</c> and
/// drives a <c>TBorderFinder</c>; if <c>columnOrdering.Section</c> is set, a section is opened first.
/// Each row is handled in this order:
/// 1. null or empty rows, and rows accepted by <c>IsNumbersRow</c>, are skipped;
/// 2. a row accepted by <c>IAdapter.IsSectionRow</c> opens a new section and is otherwise skipped;
/// 3. a row accepted by <c>IsHeaderRow</c> replaces the active column ordering and moves the loop
///    to <c>GetPossibleHeaderEnd()</c> (the "- 1" compensates for the loop's own <c>row++</c>);
/// 4. when <paramref name="updateTrigrams"/> is true the row is passed to <c>ColumnPredictor.UpdateByRow</c>;
/// 5. rows for which <c>InitPersonData</c> returns false are deliberately ignored;
/// 6. a non-empty <c>PersonName</c> starts a new declarant, otherwise a non-empty
///    <c>RelativeType</c> starts a new relative;
/// 7. unless rows are currently being skipped, the row is added to the current person.
/// The local <c>skipEmptyPerson</c> flag is set when <c>CreateNewRelative</c> throws
/// <c>SmartParserRelativeWithoutPersonException</c>, or when a row with neither a name nor a relative
/// type arrives while there is no current person and <c>FailOnRelativeOrphan</c> is set; both cases
/// are logged as errors. The flag is cleared once a new declarant yields a non-null <c>CurrentPerson</c>.
/// After the loop, <c>ColumnPredictor.WriteData</c> is called when <paramref name="updateTrigrams"/> is
/// true, the number of public servants is logged, and <c>ParsePersonalProperties</c> runs unless
/// <c>ColumnOrdering.SearchForFioColumnOnly</c> is set.
/// </remarks>
/// <param name="columnOrdering">Initial column ordering; may be replaced when a header row is found further down.</param>
/// <param name="updateTrigrams">When true, every data row is fed to the column predictor and its data is written at the end.</param>
/// <param name="documentfile_id">Passed through to <c>InitializeDeclaration</c>.</param>
/// <returns>The declaration populated during the pass.</returns>
public Declaration Parse(ColumnOrdering columnOrdering, bool updateTrigrams, int?documentfile_id) { FirstPassStartTime = DateTime.Now; Declaration declaration = InitializeDeclaration(columnOrdering, documentfile_id); int rowOffset = columnOrdering.FirstDataRow; TBorderFinder borderFinder = new TBorderFinder(declaration, FailOnRelativeOrphan); if (columnOrdering.Section != null) { borderFinder.CreateNewSection(rowOffset, columnOrdering.Section); } bool skipEmptyPerson = false; string prevPersonName = ""; for (int row = rowOffset; row < Adapter.GetRowsCount(); row++) { DataRow currRow = Adapter.GetRow(columnOrdering, row); if (currRow == null || currRow.IsEmpty()) { continue; } if (IsNumbersRow(currRow)) { continue; } Logger.Debug(String.Format("currRow {1}: {0}", currRow.DebugString(), row)); string sectionName; if (IAdapter.IsSectionRow(currRow.Cells, columnOrdering.GetMaxColumnEndIndex(), false, out sectionName)) { borderFinder.CreateNewSection(row, sectionName); continue; } { ColumnOrdering newColumnOrdering; if (IsHeaderRow(currRow, out newColumnOrdering)) { columnOrdering = newColumnOrdering; row = newColumnOrdering.GetPossibleHeaderEnd() - 1; // row++ in "for" cycle continue; } } if (updateTrigrams) { ColumnPredictor.UpdateByRow(columnOrdering, currRow); } if (!currRow.InitPersonData(prevPersonName)) { // be robust, ignore errors see 8562.pdf.docx in tests continue; } if (currRow.PersonName != String.Empty) { prevPersonName = currRow.PersonName; borderFinder.CreateNewDeclarant(Adapter, currRow); if (borderFinder.CurrentPerson != null) { skipEmptyPerson = false; } } else if (currRow.RelativeType != String.Empty) { if (!skipEmptyPerson) { try { borderFinder.CreateNewRelative(currRow); } catch (SmartParserRelativeWithoutPersonException e) { skipEmptyPerson = true; Logger.Error(e.Message); continue; } } } else { if (borderFinder.CurrentPerson == null && FailOnRelativeOrphan) { skipEmptyPerson = true; Logger.Error(String.Format("No person to attach info on 
row={0}", row)); continue; } } if (!skipEmptyPerson) { borderFinder.AddInputRowToCurrentPerson(columnOrdering, currRow); } } if (updateTrigrams) { ColumnPredictor.WriteData(); } Logger.Info("Parsed {0} declarants", declaration.PublicServants.Count()); if (!ColumnOrdering.SearchForFioColumnOnly) { ParsePersonalProperties(declaration); } return(declaration); }
/// <summary>
/// Command-line entry point: dispatches to directory, file-mask or single-file parsing.
/// </summary>
/// <param name="args">Raw command-line arguments; interpreted by <c>ParseArgs</c>.</param>
/// <returns>
/// 1 when no input was given; the result of <c>ParseDirectory</c> / <c>ParseByFileMask</c> for
/// those modes; otherwise 0, including when single-file parsing threw (the exception is only logged).
/// </returns>
public static int Main(string[] args)
{
    string inputFile = ParseArgs(args);
    Logger.Info("Command line: " + String.Join(" ", args));

    if (String.IsNullOrEmpty(inputFile))
    {
        Console.WriteLine("no input file or directory");
        return 1;
    }

    if (IsDirectory(inputFile))
    {
        return ParseDirectory(inputFile);
    }

    // Wildcards, or a leading '@', select file-mask mode.
    bool isFileMask = inputFile.Contains("*") || inputFile.Contains("?") || inputFile.StartsWith("@");
    if (isFileMask)
    {
        return ParseByFileMask(inputFile);
    }

    try
    {
        Logger.SetOutSecond();
        if (OutFile == "")
        {
            OutFile = BuildOutFileNameByInput(inputFile);
        }
        ParseFile(inputFile, OutFile);
    }
    catch (SmartParserException parserError)
    {
        Logger.Error("Parsing Exception " + parserError.ToString());
    }
    catch (Exception unexpected)
    {
        Logger.Error("Unknown Parsing Exception " + unexpected.ToString());
        Logger.Info("Stack: " + unexpected.StackTrace);
    }
    finally
    {
        // Always switch the logger back, whatever happened during parsing.
        Logger.SetOutMain();
    }

    if (ColumnPredictor.CalcPrecision)
    {
        Logger.Info(ColumnPredictor.GetPrecisionStr());
    }

    int errorCount = Logger.Errors.Count();
    if (errorCount > 0)
    {
        Logger.Info("*** Errors ({0}):", errorCount);
        foreach (string message in Logger.Errors)
        {
            Logger.Info(message);
        }
    }

    return 0;
}
/// <summary>
/// Parses one input document and writes the resulting declarations as JSON.
/// </summary>
/// <remarks>
/// Nothing is parsed when <c>CheckJson</c> is set and <paramref name="outFile"/> already exists,
/// or when <paramref name="inputFile"/> does not exist; both cases are only logged.
/// A second log file named "&lt;input file name&gt;.log" is configured next to the input file.
/// For a document with several worksheets the behaviour depends on
/// <c>DeclarationSerializer.SmartParserJsonFormat</c>: in <c>Disclosures</c> format all sheets are
/// accumulated into one declaration written to <paramref name="outFile"/>; otherwise every sheet
/// is written to its own file, with "_&lt;sheetIndex&gt;" inserted before ".json" in the file name.
/// Sheets that raise <c>ColumnDetectorException</c> are skipped.
/// </remarks>
/// <param name="inputFile">Path of the document to parse.</param>
/// <param name="outFile">Path of the JSON file to write.</param>
/// <returns>Always 0.</returns>
/// <exception cref="SmartParserException">
/// The document has no tables and its name does not end with ".toloka_json".
/// </exception>
public static int ParseFile(string inputFile, string outFile)
{
    if (CheckJson && File.Exists(outFile))
    {
        Logger.Info("JSON file {0} already exist", outFile);
        return(0);
    }

    if (!File.Exists(inputFile))
    {
        Logger.Info("ERROR: {0} file NOT exists", inputFile);
        return(0);
    }

    ColumnPredictor.InitializeIfNotAlready();

    string logFile = Path.Combine(Path.GetDirectoryName(inputFile), Path.GetFileName(inputFile) + ".log");
    Logger.SetSecondLogFileName(Path.GetFullPath(logFile));

    Logger.Info(String.Format("Parsing {0}", inputFile));
    IAdapter adapter = GetAdapter(inputFile);

    Logger.Info(String.Format("TablesCount = {0}", adapter.GetTablesCount()));
    Logger.Info(String.Format("RowsCount = {0}", adapter.GetRowsCount()));

    // The suffix is a fixed token, so compare ordinally rather than by current culture.
    if (adapter.GetTablesCount() == 0 && !inputFile.EndsWith(".toloka_json", StringComparison.Ordinal))
    {
        throw new SmartParserException("No tables found in document");
    }

    if (HtmlFileName != "")
    {
        adapter.WriteHtmlFile(HtmlFileName);
    }

    if (adapter.GetWorkSheetCount() > 1)
    {
        Logger.Info(String.Format("File has multiple ({0}) worksheets", adapter.GetWorkSheetCount()));
        Declaration allDeclarations = null;
        for (int sheetIndex = 0; sheetIndex < adapter.GetWorkSheetCount(); sheetIndex++)
        {
            adapter.SetCurrentWorksheet(sheetIndex);
            try
            {
                if (DeclarationSerializer.SmartParserJsonFormat == SmartParserJsonFormatEnum.Disclosures)
                {
                    var sheetDeclarations = BuildDeclarations(adapter, inputFile);
                    if (allDeclarations == null)
                    {
                        allDeclarations = sheetDeclarations;
                    }
                    else
                    {
                        allDeclarations.AddDeclarations(sheetDeclarations);
                    }
                }
                else
                {
                    string curOutFile = BuildWorksheetOutFileName(outFile, sheetIndex);
                    Logger.Info(String.Format("Parsing worksheet {0} into file {1}", sheetIndex, curOutFile));
                    WriteOutputJson(inputFile, BuildDeclarations(adapter, inputFile), curOutFile);
                }
            }
            catch (ColumnDetectorException)
            {
                Logger.Info(String.Format("Skipping empty sheet {0} (No headers found exception thrown)", sheetIndex));
            }

            // NOTE(review): the accumulated result is rewritten after every sheet, not once after
            // the loop; presumably so partial output survives a failure on a later sheet - confirm.
            if (allDeclarations != null)
            {
                WriteOutputJson(inputFile, allDeclarations, outFile);
            }
        }
    }
    else
    {
        WriteOutputJson(inputFile, BuildDeclarations(adapter, inputFile), outFile);
    }

    return(0);
}

/// <summary>
/// Derives the per-worksheet output path by inserting "_&lt;sheetIndex&gt;" before ".json" in the
/// file-name part of <paramref name="outFile"/>; the directory part is copied unchanged, so a
/// directory whose name contains ".json" is no longer rewritten.
/// </summary>
private static string BuildWorksheetOutFileName(string outFile, int sheetIndex)
{
    string fileName = Path.GetFileName(outFile);
    // Path.GetFileName returns the trailing part of the path, so the rest is the directory prefix.
    string directoryPrefix = outFile.Substring(0, outFile.Length - fileName.Length);
    return directoryPrefix + fileName.Replace(".json", "_" + sheetIndex.ToString() + ".json");
}
/// <summary>
/// Parses every file in <paramref name="files"/>, logs a per-file outcome and a final summary
/// grouped by outcome, and dumps the unknown real-estate strings collected by the logger.
/// </summary>
/// <param name="files">Input files; each is parsed into the name given by <c>BuildOutFileNameByInput</c>.</param>
/// <param name="outputDir">Directory that receives "UnknownRealEstate.txt" when there is anything to write.</param>
/// <returns>Always 0.</returns>
public static int ParseMultipleFiles(IEnumerable <string> files, string outputDir)
{
    // Buckets are created in a fixed order because the summary below is printed in this order.
    var parse_results = new Dictionary <string, List <string> >();
    foreach (string bucketName in new[] { "ok", "error", "too_many_errors", "exception" })
    {
        parse_results.Add(bucketName, new List <string>());
    }

    foreach (string file in files)
    {
        Logger.Info("Parsing file " + file);

        bool failed = false;
        try
        {
            Logger.SetOutSecond();
            ParseFile(file, BuildOutFileNameByInput(file));
        }
        catch (Exception e)
        {
            failed = true;
            Logger.Error("Parsing Exception " + e.ToString());
            // The stack trace is logged only for exceptions other than SmartParserException.
            if (!(e is SmartParserException))
            {
                Logger.Debug("Stack: " + e.StackTrace);
            }
            parse_results["exception"].Add(file);
        }
        finally
        {
            Logger.SetOutMain();
        }

        int errorCount = Logger.Errors.Count();
        if (failed)
        {
            Logger.Info("Result: Exception");
        }
        else if (errorCount > 0)
        {
            Logger.Info("Result: error");
            parse_results["error"].Add(file);
        }
        else
        {
            Logger.Info("Result: OK");
            parse_results["ok"].Add(file);
        }

        if (errorCount > 0)
        {
            Logger.Info(" Parsing errors ({0})", errorCount);
            foreach (string message in Logger.Errors)
            {
                Logger.Info(message);
            }
        }
    }

    Logger.Info("Parsing Results:");
    foreach (var bucket in parse_results)
    {
        Logger.Info("Result: {0} ({1})", bucket.Key, bucket.Value.Count);
        foreach (string parsedFile in bucket.Value)
        {
            Logger.Info(parsedFile);
        }
    }

    int unknownCount = Logger.UnknownRealEstate.Count();
    if (unknownCount > 0)
    {
        Logger.Info("UnknownRealEstate.Count: {0}", unknownCount);
        string dictfile = Path.Combine(outputDir, "UnknownRealEstate.txt");
        File.WriteAllText(dictfile, string.Join("\n", Logger.UnknownRealEstate));
        Logger.Info("Output UnknownRealEstate to file {0}", dictfile);
    }

    if (ColumnPredictor.CalcPrecision)
    {
        Logger.Info(ColumnPredictor.GetPrecisionStr());
    }

    return 0;
}