コード例 #1
0
        /// <summary>
        /// Walks the header cells in list order, determines a <see cref="DeclarationField"/> for each
        /// one and registers it in <paramref name="columnOrdering"/> through AddColumn.
        /// </summary>
        /// <param name="adapter">Table adapter; supplies the row count and is handed to the column predictor.</param>
        /// <param name="cells">Header cells to classify.</param>
        /// <param name="columnOrdering">Ordering that receives the classified columns.</param>
        /// <exception cref="SmartParserException">
        /// A title that is not "too short" could be resolved neither by HeaderHelpers nor by prediction.
        /// </exception>
        /// <exception cref="ColumnDetectorException">
        /// The field is still None after prediction while the title is not an empty value.
        /// </exception>
        static public void MapStringsToConstants(IAdapter adapter, List <Cell> cells, ColumnOrdering columnOrdering)
        {
            foreach (Cell headerCell in cells)
            {
                string title = headerCell.GetText(true);
                Logger.Debug(string.Format("column title: \"{0}\"[{1}]", title.ReplaceEolnWithSpace().CoalesceWhitespace(), headerCell.CellWidth));

                // Title with every absence marker removed; only its length is used below.
                string strippedTitle = AbsenceMarkers.Aggregate(title, (acc, marker) => acc.Replace(marker, "")).Trim();

                // A cell merged over as many rows as the adapter reports is skipped entirely.
                if (adapter.GetRowsCount() == headerCell.MergedRowsCount)
                {
                    continue;
                }

                DeclarationField detected;
                bool titleTooShort = (title == "" || strippedTitle.Length <= 1) && (title != "№");

                if (titleTooShort)
                {
                    // Nothing usable in the title: guess the field from the column values.
                    detected = ColumnPredictor.PredictEmptyColumnTitle(adapter, headerCell);
                    Logger.Debug("Predict: " + detected.ToString());
                }
                else
                {
                    if (headerCell.TextAbove != null)
                    {
                        title = headerCell.TextAbove + " " + title;
                    }

                    string flatTitle = title.Replace('\n', ' ');
                    detected = HeaderHelpers.TryGetField(flatTitle);

                    // Short, unrecognized titles get a second chance through the predictor.
                    if (detected == DeclarationField.None && strippedTitle.Length <= 4)
                    {
                        detected = ColumnPredictor.PredictEmptyColumnTitle(adapter, headerCell);
                        Logger.Debug("Predict: " + detected.ToString());
                    }

                    if (detected == DeclarationField.None)
                    {
                        throw new SmartParserException(String.Format("Cannot recognize field \"{0}\"", flatTitle));
                    }
                }

                if (detected == DeclarationField.None && !DataHelper.IsEmptyValue(title))
                {
                    throw new ColumnDetectorException(String.Format("Fail to detect column type row: {0} title:{1}", headerCell.Row, title));
                }

                if (ColumnPredictor.CalcPrecision)
                {
                    ColumnPredictor.PredictForPrecisionCheck(adapter, headerCell, detected);
                }

                AddColumn(columnOrdering, detected, headerCell);

                // In "FIO only" mode the scan stops as soon as a name column has been registered.
                bool isNameColumn = detected == DeclarationField.NameAndOccupationOrRelativeType ||
                                    detected == DeclarationField.NameOrRelativeType;
                if (ColumnOrdering.SearchForFioColumnOnly && isNameColumn)
                {
                    break;
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// Runs header detection on "customs-tworow-header.xls" and checks that 14 columns are found
        /// and that the Occupation column begins at column index 2.
        /// </summary>
        public void TwoRowHeaderEmptyTopCellTest2()
        {
            string   tableFile = Path.Combine(TestUtil.GetTestDataPath(), "customs-tworow-header.xls");
            IAdapter adapter   = AsposeExcelAdapter.CreateAdapter(tableFile);

            ColumnPredictor.InitializeIfNotAlready();
            ColumnOrdering ordering = ColumnDetector.ExamineTableBeginning(adapter);

            // Expected value goes first so that a failure reports expected/actual the right way round.
            Assert.AreEqual(14, ordering.ColumnOrder.Count);
            Assert.AreEqual(2, ordering.ColumnOrder[DeclarationField.Occupation].BeginColumn);
        }
コード例 #3
0
        /// <summary>
        /// Runs header detection on "17497.xls" and checks that 15 columns are found, that both
        /// VehicleType and VehicleModel are present, and that the combined Vehicle field is absent.
        /// </summary>
        public void FixVehicleColumns()
        {
            IAdapter tableAdapter = AsposeExcelAdapter.CreateAdapter(
                Path.Combine(TestUtil.GetTestDataPath(), "17497.xls"),
                -1);

            ColumnPredictor.InitializeIfNotAlready();

            ColumnOrdering detected = ColumnDetector.ExamineTableBeginning(tableAdapter);

            Assert.AreEqual(15, detected.ColumnOrder.Count);

            // The vehicle information must be split into two dedicated columns.
            Assert.IsTrue(detected.ContainsField(DeclarationField.VehicleType));
            Assert.IsTrue(detected.ContainsField(DeclarationField.VehicleModel));
            Assert.IsFalse(detected.ContainsField(DeclarationField.Vehicle));
        }
コード例 #4
0
        /// <summary>
        /// Pairs the i-th cell with the i-th entry of <c>columnOrdering.MergedColumnOrder</c> and checks
        /// each pair by pixel-range intersection.
        /// </summary>
        /// <param name="columnOrdering">Ordering whose merged columns provide the field and pixel range per position.</param>
        /// <param name="cells">Row cells, expected to match the merged columns one-to-one.</param>
        /// <returns>
        /// The field-to-cell map, or null when the counts differ, when a non-empty cell without pixel
        /// overlap fails the semantic test, or when the miss counter ends at 3 or more.
        /// </returns>
        static Dictionary <DeclarationField, Cell> MapByOrderAndIntersection(ColumnOrdering columnOrdering, List <Cell> cells)
        {
            if (columnOrdering.MergedColumnOrder.Count != cells.Count)
            {
                return null;
            }

            int cellLeft = cells[0].AdditTableIndention;
            var mapping  = new Dictionary <DeclarationField, Cell>();
            int misses   = 0;

            for (int idx = 0; idx < cells.Count; idx++)
            {
                Cell cell = cells[idx];

                // Pixel range of the cell: it starts where the previous cell ended.
                int cellRight = cellLeft + cell.CellWidth;

                // Pixel range of the column expected at the same position.
                var column      = columnOrdering.MergedColumnOrder[idx];
                int columnLeft  = column.ColumnPixelStart;
                int columnRight = columnLeft + column.ColumnPixelWidth;

                bool overlaps = ColumnOrdering.PeriodIntersection(cellLeft, cellRight, columnLeft, columnRight) != 0;
                if (!overlaps)
                {
                    misses++;

                    if (!DataHelper.IsEmptyValue(cell.Text))
                    {
                        if (!ColumnPredictor.TestFieldWithoutOwntypes(column.Field, cell))
                        {
                            Logger.Debug(string.Format("cannot map column N={0} text={1}", idx, cell.Text.Replace("\n", "\\n")));
                            return null;
                        }

                        // The cell content supports the mapping, so the miss counter starts over.
                        Logger.Debug(string.Format("found semantic argument for mapping N={0} text={1} to {2}",
                                                   idx, cell.Text.Replace("\n", "\\n"), column.Field));
                        misses = 0;
                    }
                }

                mapping[column.Field] = cell;
                cellLeft = cellRight;
            }

            return misses >= 3 ? null : mapping;
        }
コード例 #5
0
        /// <summary>
        /// Runs header detection on "rabotniki_podved_organizacii_2013.xlsx" and checks the begin
        /// column of thirteen fields, which are expected at consecutive columns 0..12.
        /// </summary>
        public void EmptyRealStateTypeColumnDetectorTest1()
        {
            string   xlsxFile = Path.Combine(TestUtil.GetTestDataPath(), "rabotniki_podved_organizacii_2013.xlsx");
            IAdapter adapter  = AsposeExcelAdapter.CreateAdapter(xlsxFile);

            ColumnPredictor.InitializeIfNotAlready();
            ColumnOrdering ordering = ColumnDetector.ExamineTableBeginning(adapter);

            // The index in this array is the expected zero-based BeginColumn of the field.
            DeclarationField[] expectedFields =
            {
                DeclarationField.Number,
                DeclarationField.NameOrRelativeType,
                DeclarationField.Occupation,
                DeclarationField.OwnedRealEstateType,
                DeclarationField.OwnedRealEstateOwnershipType,
                DeclarationField.OwnedRealEstateSquare,
                DeclarationField.OwnedRealEstateCountry,
                DeclarationField.StatePropertyType,
                DeclarationField.StatePropertySquare,
                DeclarationField.StatePropertyCountry,
                DeclarationField.Vehicle,
                DeclarationField.DeclaredYearlyIncome,
                DeclarationField.DataSources,
            };

            for (int column = 0; column < expectedFields.Length; column++)
            {
                DeclarationField field = expectedFields[column];

                // AreEqual (instead of IsTrue(a == b)) reports the actual column on failure.
                Assert.AreEqual(column, ordering.ColumnOrder[field].BeginColumn, "BeginColumn of " + field);
            }
        }
コード例 #6
0
        /// <summary>
        /// Reads the table rows from <c>columnOrdering.FirstDataRow</c> up to <c>Adapter.GetRowsCount()</c>
        /// and distributes them between declarants and relatives with a <c>TBorderFinder</c>.
        /// </summary>
        /// <param name="columnOrdering">
        /// Initial column layout. It is replaced inside the loop whenever a row is recognized as a new header.
        /// </param>
        /// <param name="updateTrigrams">
        /// When true, every data row is passed to <c>ColumnPredictor.UpdateByRow</c> and
        /// <c>ColumnPredictor.WriteData</c> is called after the loop.
        /// </param>
        /// <param name="documentfile_id">Forwarded to <c>InitializeDeclaration</c>.</param>
        /// <returns>The declaration created by <c>InitializeDeclaration</c>, filled by the border finder.</returns>
        public Declaration Parse(ColumnOrdering columnOrdering, bool updateTrigrams, int?documentfile_id)
        {
            // Remember when this pass started.
            FirstPassStartTime = DateTime.Now;

            Declaration declaration = InitializeDeclaration(columnOrdering, documentfile_id);

            int rowOffset = columnOrdering.FirstDataRow;

            TBorderFinder borderFinder = new TBorderFinder(declaration, FailOnRelativeOrphan);

            // A section already known from the header is opened before the first data row.
            if (columnOrdering.Section != null)
            {
                borderFinder.CreateNewSection(rowOffset, columnOrdering.Section);
            }

            // skipEmptyPerson: while true, rows are NOT added to the current person. It is set when a
            // relative or a plain row cannot be attached to anybody, and cleared once a new declarant
            // has been created successfully.
            bool   skipEmptyPerson = false;
            // Name of the last declarant seen; handed to InitPersonData for every following row.
            string prevPersonName  = "";

            for (int row = rowOffset; row < Adapter.GetRowsCount(); row++)
            {
                DataRow currRow = Adapter.GetRow(columnOrdering, row);
                if (currRow == null || currRow.IsEmpty())
                {
                    continue;
                }
                // NOTE(review): presumably a row that only enumerates the columns — confirm in IsNumbersRow.
                if (IsNumbersRow(currRow))
                {
                    continue;
                }
                Logger.Debug(String.Format("currRow {1}: {0}", currRow.DebugString(), row));

                // A section row opens a new section and carries no person data itself.
                string sectionName;
                if (IAdapter.IsSectionRow(currRow.Cells, columnOrdering.GetMaxColumnEndIndex(), false, out sectionName))
                {
                    borderFinder.CreateNewSection(row, sectionName);
                    continue;
                }
                // Bare block: limits the scope of newColumnOrdering to the header check.
                {
                    ColumnOrdering newColumnOrdering;
                    if (IsHeaderRow(currRow, out newColumnOrdering))
                    {
                        // A repeated header inside the table: switch to its layout and jump past it.
                        columnOrdering = newColumnOrdering;
                        row            = newColumnOrdering.GetPossibleHeaderEnd() - 1; // row++ in "for" cycle
                        continue;
                    }
                }

                if (updateTrigrams)
                {
                    ColumnPredictor.UpdateByRow(columnOrdering, currRow);
                }

                if (!currRow.InitPersonData(prevPersonName))
                {
                    // be robust, ignore errors see 8562.pdf.docx in tests
                    continue;
                }

                if (currRow.PersonName != String.Empty)
                {
                    // Row starts a new declarant.
                    prevPersonName = currRow.PersonName;
                    borderFinder.CreateNewDeclarant(Adapter, currRow);
                    if (borderFinder.CurrentPerson != null)
                    {
                        skipEmptyPerson = false;
                    }
                }
                else if (currRow.RelativeType != String.Empty)
                {
                    // Row starts a new relative; it is not even attempted while rows are being skipped.
                    if (!skipEmptyPerson)
                    {
                        try
                        {
                            borderFinder.CreateNewRelative(currRow);
                        }
                        catch (SmartParserRelativeWithoutPersonException e)
                        {
                            // Relative without a declarant: log it and start skipping rows.
                            skipEmptyPerson = true;
                            Logger.Error(e.Message);
                            continue;
                        }
                    }
                }
                else
                {
                    // Continuation row (neither a person name nor a relative type).
                    if (borderFinder.CurrentPerson == null && FailOnRelativeOrphan)
                    {
                        skipEmptyPerson = true;
                        Logger.Error(String.Format("No person to attach info on row={0}", row));
                        continue;
                    }
                }
                if (!skipEmptyPerson)
                {
                    borderFinder.AddInputRowToCurrentPerson(columnOrdering, currRow);
                }
            }
            if (updateTrigrams)
            {
                ColumnPredictor.WriteData();
            }

            Logger.Info("Parsed {0} declarants", declaration.PublicServants.Count());
            // Personal properties are not parsed when only the FIO column is being searched for.
            if (!ColumnOrdering.SearchForFioColumnOnly)
            {
                ParsePersonalProperties(declaration);
            }

            return(declaration);
        }
コード例 #7
0
        /// <summary>
        /// Command-line entry point. Dispatches to directory parsing, file-mask parsing or
        /// single-file parsing depending on the input argument returned by ParseArgs.
        /// </summary>
        /// <param name="args">Raw command-line arguments.</param>
        /// <returns>
        /// 1 when no input was given; the result of ParseDirectory or ParseByFileMask for those modes;
        /// 0 after single-file parsing (exceptions in that mode are logged, not propagated).
        /// </returns>
        public static int Main(string[] args)
        {
            string input = ParseArgs(args);

            Logger.Info("Command line: " + String.Join(" ", args));

            if (String.IsNullOrEmpty(input))
            {
                Console.WriteLine("no input file or directory");
                return 1;
            }

            if (IsDirectory(input))
            {
                return ParseDirectory(input);
            }

            bool handledByMaskParser = input.Contains("*") || input.Contains("?") || input.StartsWith("@");
            if (handledByMaskParser)
            {
                return ParseByFileMask(input);
            }

            // Single file: the logger is switched to its second output for the duration of the parse.
            try
            {
                Logger.SetOutSecond();

                if (OutFile == "")
                {
                    OutFile = BuildOutFileNameByInput(input);
                }

                ParseFile(input, OutFile);
            }
            catch (SmartParserException parseFailure)
            {
                Logger.Error("Parsing Exception " + parseFailure.ToString());
            }
            catch (Exception unexpected)
            {
                Logger.Error("Unknown Parsing Exception " + unexpected.ToString());
                Logger.Info("Stack: " + unexpected.StackTrace);
            }
            finally
            {
                Logger.SetOutMain();
            }

            if (ColumnPredictor.CalcPrecision)
            {
                Logger.Info(ColumnPredictor.GetPrecisionStr());
            }

            // Repeat every collected error at the end of the log.
            int errorCount = Logger.Errors.Count();
            if (errorCount > 0)
            {
                Logger.Info("*** Errors ({0}):", errorCount);

                foreach (string message in Logger.Errors)
                {
                    Logger.Info(message);
                }
            }

            return 0;
        }
コード例 #8
0
        /// <summary>
        /// Parses one input document and writes the resulting JSON. A document with several
        /// worksheets is parsed sheet by sheet.
        /// </summary>
        /// <param name="inputFile">Path of the document to parse.</param>
        /// <param name="outFile">Path of the JSON file to write.</param>
        /// <returns>
        /// Always 0, including when the output already exists (with CheckJson set) or the input file is missing.
        /// </returns>
        /// <exception cref="SmartParserException">
        /// The adapter reports no tables and the input does not end with ".toloka_json".
        /// </exception>
        public static int ParseFile(string inputFile, string outFile)
        {
            if (CheckJson && File.Exists(outFile))
            {
                Logger.Info("JSON file {0} already exist", outFile);
                return 0;
            }

            if (!File.Exists(inputFile))
            {
                Logger.Info("ERROR: {0} file NOT exists", inputFile);
                return 0;
            }

            ColumnPredictor.InitializeIfNotAlready();

            // The second log file sits next to the input: "<input file name>.log".
            string logName = Path.GetFileName(inputFile) + ".log";
            string logPath = Path.Combine(Path.GetDirectoryName(inputFile), logName);
            Logger.SetSecondLogFileName(Path.GetFullPath(logPath));

            Logger.Info(String.Format("Parsing {0}", inputFile));
            IAdapter adapter = GetAdapter(inputFile);

            Logger.Info(String.Format("TablesCount = {0}", adapter.GetTablesCount()));
            Logger.Info(String.Format("RowsCount = {0}", adapter.GetRowsCount()));

            if (adapter.GetTablesCount() == 0 && !inputFile.EndsWith(".toloka_json"))
            {
                throw new SmartParserException("No tables found in document");
            }

            if (HtmlFileName != "")
            {
                adapter.WriteHtmlFile(HtmlFileName);
            }

            // Single worksheet (or none): one parse, one output file.
            if (!(adapter.GetWorkSheetCount() > 1))
            {
                WriteOutputJson(inputFile, BuildDeclarations(adapter, inputFile), outFile);
                return 0;
            }

            Logger.Info(String.Format("File has multiple ({0}) worksheets", adapter.GetWorkSheetCount()));

            Declaration merged = null;
            for (int sheet = 0; sheet < adapter.GetWorkSheetCount(); sheet++)
            {
                adapter.SetCurrentWorksheet(sheet);
                try
                {
                    if (DeclarationSerializer.SmartParserJsonFormat == SmartParserJsonFormatEnum.Disclosures)
                    {
                        // Disclosures format: all sheets are accumulated into one declaration.
                        var sheetResult = BuildDeclarations(adapter, inputFile);
                        if (merged != null)
                        {
                            merged.AddDeclarations(sheetResult);
                        }
                        else
                        {
                            merged = sheetResult;
                        }
                    }
                    else
                    {
                        // Other formats: one output file per sheet, with "_<index>" inserted before ".json".
                        string sheetOutFile = outFile.Replace(".json", "_" + sheet.ToString() + ".json");
                        Logger.Info(String.Format("Parsing worksheet {0} into file {1}", sheet, sheetOutFile));
                        var sheetResult = BuildDeclarations(adapter, inputFile);
                        WriteOutputJson(inputFile, sheetResult, sheetOutFile);
                    }
                }
                catch (ColumnDetectorException)
                {
                    Logger.Info(String.Format("Skipping empty sheet {0} (No headers found exception thrown)",
                                              sheet));
                }

                // Written after every sheet, so the combined file reflects the sheets processed so far.
                if (merged != null)
                {
                    WriteOutputJson(inputFile, merged, outFile);
                }
            }

            return 0;
        }
コード例 #9
0
        /// <summary>
        /// Parses every file of <paramref name="files"/> with ParseFile, classifies each one as
        /// "ok", "error" or "exception", and logs a summary at the end.
        /// </summary>
        /// <param name="files">Paths of the input files.</param>
        /// <param name="outputDir">Directory that receives "UnknownRealEstate.txt" when unknown real estate was logged.</param>
        /// <returns>Always 0; failures of individual files are logged only.</returns>
        public static int ParseMultipleFiles(IEnumerable <string> files, string outputDir)
        {
            // Result name -> files with that result. "too_many_errors" is reported but never filled here.
            var buckets = new Dictionary <string, List <string> >();
            buckets.Add("ok", new List <string>());
            buckets.Add("error", new List <string>());
            buckets.Add("too_many_errors", new List <string>());
            buckets.Add("exception", new List <string>());

            foreach (string path in files)
            {
                Logger.Info("Parsing file " + path);

                bool threw = false;
                try
                {
                    Logger.SetOutSecond();
                    ParseFile(path, BuildOutFileNameByInput(path));
                }
                catch (Exception failure)
                {
                    threw = true;
                    Logger.Error("Parsing Exception " + failure.ToString());

                    // The stack is logged separately only for exceptions that are not parser exceptions.
                    if (!(failure is SmartParserException))
                    {
                        Logger.Debug("Stack: " + failure.StackTrace);
                    }

                    buckets["exception"].Add(path);
                }
                finally
                {
                    Logger.SetOutMain();
                }

                if (threw)
                {
                    Logger.Info("Result: Exception");
                }
                else if (Logger.Errors.Count() > 0)
                {
                    Logger.Info("Result: error");
                    buckets["error"].Add(path);
                }
                else
                {
                    Logger.Info("Result: OK");
                    buckets["ok"].Add(path);
                }

                int errorCount = Logger.Errors.Count();
                if (errorCount > 0)
                {
                    Logger.Info(" Parsing errors ({0})", errorCount);

                    foreach (string message in Logger.Errors)
                    {
                        Logger.Info(message);
                    }
                }
            }

            Logger.Info("Parsing Results:");

            foreach (var bucket in buckets)
            {
                Logger.Info("Result: {0} ({1})", bucket.Key, bucket.Value.Count);
                foreach (string path in bucket.Value)
                {
                    Logger.Info(path);
                }
            }

            int unknownCount = Logger.UnknownRealEstate.Count();
            if (unknownCount > 0)
            {
                Logger.Info("UnknownRealEstate.Count: {0}", unknownCount);

                string dictfile = Path.Combine(outputDir, "UnknownRealEstate.txt");
                File.WriteAllText(dictfile, string.Join("\n", Logger.UnknownRealEstate));
                Logger.Info("Output UnknownRealEstate to file {0}", dictfile);
            }

            if (ColumnPredictor.CalcPrecision)
            {
                Logger.Info(ColumnPredictor.GetPrecisionStr());
            }

            return 0;
        }