示例#1
0
        /// <summary>
        /// Runs the block-building steps on page 1 of <c>bin/{basename}.pdf</c> and ends with a
        /// <c>ResizeBlocksets</c> validation whose errors are shown in red.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void ValidateResizeBlock(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            // Input document, output path and page selection.
            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-test-output.pdf")
                .Page(1);

            // Table and image steps; RemoveOverlapedImages is validated (red) and then applied.
            var afterImages = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<PreProcessImages>()
                .Validate<RemoveOverlapedImages>().ShowErrors(p => p.Show(Color.Red))
                .ParseBlock<RemoveOverlapedImages>();

            // Text steps up to the column split.
            var afterColumns = afterImages
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>()
                .ParseBlock<FindInitialBlockset>()
                .ParseBlock<BreakColumns>();

            // Disabled steps, kept for reference:
            //   .Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Purple))
            //   .Validate<RemoveHeaderImage>().ShowErrors(p => p.Show(Color.Purple))
            var afterSpacing = afterColumns
                .ParseBlock<RemoveFooter>()
                .ParseBlock<RemoveHeaderImage>()
                .ParseBlock<AddTableSpace>()
                .ParseBlock<AddImageSpace>();

            // Disabled step, kept for reference: .ParseBlock<BreakInlineElements>()
            afterSpacing
                .Validate<ResizeBlocksets>().ShowErrors(p => p.Show(Color.Red));

            pdfPipeline.Done();
        }
示例#2
0
        /// <summary>
        /// Processes page 1 of <c>bin/{basename}.pdf</c> through the table, text, header/footer,
        /// <c>ResizeBlocksets</c> and <c>OrderBlocksets</c> steps, then shows the result in orange
        /// with black lines.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void ResizeBlocksets(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            // Input document, output path and page selection.
            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-table-output.pdf")
                .Page(1);

            // Table and text steps up to the column split.
            var afterColumns = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>()
                .ParseBlock<FindInitialBlockset>()
                .ParseBlock<BreakColumns>();

            // Both validations (purple) run before either removal step is applied.
            var afterHeaderFooter = afterColumns
                .Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Purple))
                .Validate<RemoveHeader>().ShowErrors(p => p.Show(Color.Purple))
                .ParseBlock<RemoveFooter>()
                .ParseBlock<RemoveHeader>();

            afterHeaderFooter
                .ParseBlock<AddTableSpace>()
                .ParseBlock<ResizeBlocksets>()
                .ParseBlock<OrderBlocksets>()
                .Show(Color.Orange)
                .ShowLine(Color.Black);

            pdfPipeline.Done();
        }
示例#3
0
        /// <summary>
        /// Processes page 1 of <c>{_inputFolder}/{basename}.pdf</c> through the full step list ending
        /// in <c>OrderBlocksets</c>, then shows the result in orange with black lines.
        /// The output path is <c>{pdfsDir}/09-orders-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths.</param>
        public static void CorrectOrder(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            // Input document, output path and page selection.
            var firstPage = pdfPipeline
                .Input($"{_inputFolder}/{basename}.pdf")
                .Output($"{pdfsDir}/09-orders-{basename}-output.pdf")
                .Page(1);

            // Table and image steps.
            var afterImages = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<PreProcessImages>()
                .ParseBlock<RemoveOverlapedImages>();

            // Text steps up to line grouping.
            var afterLines = afterImages
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveSmallFonts>()
                .ParseBlock<MergeTableText>()
                .ParseBlock<HighlightTextTable>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>();

            // Header image: validated (purple), then applied, before the blockset steps.
            var afterColumns = afterLines
                .Validate<RemoveHeaderImage>().ShowErrors(p => p.Show(Color.Purple))
                .ParseBlock<RemoveHeaderImage>()
                .ParseBlock<FindInitialBlocksetWithRewind>()
                .ParseBlock<BreakColumnsLight>();

            // Footer: validated (purple), then applied, followed by the spacing steps.
            var afterSpacing = afterColumns
                .Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Purple))
                .ParseBlock<RemoveFooter>()
                .ParseBlock<AddTableSpace>()
                .ParseBlock<AddImageSpace>()
                .ParseBlock<BreakInlineElements>();

            // ResizeBlocksets is applied first and validated afterwards (red).
            afterSpacing
                .ParseBlock<ResizeBlocksets>()
                .Validate<ResizeBlocksets>().ShowErrors(p => p.Show(Color.Red))
                .ParseBlock<OrderBlocksets>()
                .Show(Color.Orange)
                .ShowLine(Color.Black);

            pdfPipeline.Done();
        }
        /// <summary>
        /// Runs the validation steps over the pages of <c>{inputfolder}/{basename}.pdf</c>, then
        /// saves <c>{outputfolder}/{basename}-ok.pdf</c> and
        /// <c>{outputfolder}/errors/{basename}-errors.pdf</c>.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension).</param>
        /// <param name="inputfolder">Folder that contains the input PDF.</param>
        /// <param name="outputfolder">Folder used to build the "ok" and "errors" output paths.</param>
        /// <returns>The value returned by <c>pipeline.SaveErrors(...)</c>.</returns>
        public static int Process(string basename, string inputfolder, string outputfolder)
        {
            //PdfReaderException.DisableWarnings();
            //PdfReaderException.ContinueOnException();

            var pipeline = new Execution.Pipeline();

            // The list itself is not used, so it is no longer stored in a local. The ToList()
            // call is kept because it presumably forces the per-page work to run — TODO confirm.
            // No page is excluded: the exception list passed to AllPagesExcept is empty.
            pipeline.Input($"{inputfolder}/{basename}.pdf")
                //.Output($"{outputfolder}/{basename}-output.pdf")
                .AllPagesExcept<CreateTextLines>(new int[] { }, page =>
                    page.ParsePdf<ProcessPdfValidation>()
                        //.Show(Color.White)
                        .ParseBlock<IdentifyValidationMarks>()
                        .PdfCheck<CheckNoBlockSetOverlap>(Color.Orange)
                        //.Show(Color.Blue)
                    )
                .ToList();

            pipeline.SaveOk($"{outputfolder}/{basename}-ok.pdf");
            int errors = pipeline.SaveErrors($"{outputfolder}/errors/{basename}-errors.pdf");

            pipeline.Done();

            return errors;
        }
示例#5
0
        /// <summary>
        /// For every page of <c>bin/{basename}.pdf</c>, runs the table, image and text steps and
        /// then validates and applies <c>RemoveHeaderImage</c> and <c>RemoveFooter</c>, showing
        /// validation errors in purple.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void ShowHeaderFooter(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var document = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-header-footer-output.pdf");

            document.AllPages(page =>
            {
                // Tables are shown in red, images in green.
                var afterImages = page
                    .ParsePdf<PreProcessTables>()
                    .ParseBlock<IdentifyTables>()
                    .Show(Color.Red)
                    .ParsePdf<PreProcessImages>()
                    .Show(Color.Green)
                    .ParseBlock<RemoveOverlapedImages>();

                // Text steps up to line grouping.
                var afterLines = afterImages
                    .ParsePdf<ProcessPdfText>()
                    .ParseBlock<RemoveSmallFonts>()
                    .ParseBlock<MergeTableText>()
                    .ParseBlock<HighlightTextTable>()
                    .ParseBlock<RemoveTableText>()
                    .ParseBlock<GroupLines>();

                // Header image: validated (purple), then applied.
                var afterHeader = afterLines
                    .Validate<RemoveHeaderImage>().ShowErrors(p => p.Show(Color.Purple))
                    .ParseBlock<RemoveHeaderImage>();

                // Disabled alternative to BreakColumnsLight: .ParseBlock<BreakColumns>()
                afterHeader
                    .ParseBlock<FindInitialBlocksetWithRewind>()
                    .ParseBlock<BreakColumnsLight>()
                    .Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Purple))
                    .ParseBlock<RemoveFooter>();
            });

            pdfPipeline.Done();
        }
示例#6
0
        /// <summary>
        /// Calls <c>ProcessPage</c> for every page of <c>bin/{basename}.pdf</c>, with the output
        /// path set to <c>bin/{basename}-page-output.pdf</c>.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void MultipageCore(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var document = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-page-output.pdf");

            // Per-page work is delegated to ProcessPage.
            document.AllPages(p => ProcessPage(p));

            pdfPipeline.Done();
        }
示例#7
0
        /// <summary>
        /// Runs <c>ProcessPdfText</c> on page 1 of <c>bin/{basename}.pdf</c> and shows the result
        /// in yellow.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void MarkAllComponents(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-tmp-output.pdf")
                .Page(1);

            firstPage
                .ParsePdf<ProcessPdfText>()
                .Show(Color.Yellow);

            pdfPipeline.Done();
        }
示例#8
0
        /// <summary>
        /// Runs <c>ProcessPdfText</c> and <c>GroupLines</c> on page 1 of <c>bin/{basename}.pdf</c>
        /// and shows the result in orange.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void GroupLines(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            // Page(1) is called before Output(...) here; that call order is kept as written.
            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Page(1)
                .Output($"bin/{basename}-tmp-output.pdf");

            firstPage
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<GroupLines>()
                .Show(Color.Orange);

            pdfPipeline.Done();
        }
示例#9
0
        /// <summary>
        /// Runs <c>ProcessPdfText</c> on page 1 of <c>{_inputFolder}/{basename}.pdf</c> and shows
        /// the result in orange. The output path is <c>{pdfsDir}/01-blocks-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths.</param>
        public static void Blocks(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"{_inputFolder}/{basename}.pdf")
                .Output($"{pdfsDir}/01-blocks-{basename}-output.pdf")
                .Page(1);

            firstPage
                .ParsePdf<ProcessPdfText>()
                .Show(Color.Orange);

            pdfPipeline.Done();
        }
示例#10
0
        /// <summary>
        /// Runs <c>PreProcessRenderPath</c> on page 1 of <c>bin/{basename}.pdf</c> and shows the
        /// lines in green.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void ShowRenderPath(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-tmp-output.pdf")
                .Page(1);

            firstPage
                .ParsePdf<PreProcessRenderPath>()
                .ShowLine(Color.Green);

            pdfPipeline.Done();
        }
示例#11
0
        /// <summary>
        /// Runs <c>PreProcessImages</c> on page 1 of <c>bin/{basename}.pdf</c> and shows the
        /// result in red.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void ProcessImages(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-img-output.pdf")
                .Page(1);

            firstPage
                .ParsePdf<PreProcessImages>()
                .Show(Color.Red);

            pdfPipeline.Done();
        }
示例#12
0
        /// <summary>
        /// For every page of <c>bin/{basename}.pdf</c>, runs <c>ProcessPdfText</c> and shows the
        /// lines in orange.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void FollowText(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var document = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-follow-text-output.pdf");

            document.AllPages(page => page.ParsePdf<ProcessPdfText>().ShowLine(Color.Orange));

            pdfPipeline.Done();
        }
示例#13
0
        /// <summary>
        /// Runs <c>ProcessPdfText</c> and <c>GroupLines</c> on page 1 of
        /// <c>{_inputFolder}/{basename}.pdf</c> and shows the result in red. The output path is
        /// <c>{pdfsDir}/02-blockline-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths.</param>
        public static void BlockLines(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"{_inputFolder}/{basename}.pdf")
                .Output($"{pdfsDir}/02-blockline-{basename}-output.pdf")
                .Page(1);

            firstPage
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<GroupLines>()
                .Show(Color.Red);

            pdfPipeline.Done();
        }
示例#14
0
        /// <summary>
        /// For every page of <c>{_inputFolder}/{basename}.pdf</c>, runs <c>ProcessPdfText</c> and
        /// validates <c>RemoveSmallFonts</c>, showing the error text in green. The output path is
        /// <c>{pdfsDir}/10-ids-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths.</param>
        public static void FindIds(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var document = pdfPipeline
                .Input($"{_inputFolder}/{basename}.pdf")
                .Output($"{pdfsDir}/10-ids-{basename}-output.pdf");

            document.AllPages(page =>
            {
                var parsedText = page.ParsePdf<ProcessPdfText>();

                // Only the validation is run; RemoveSmallFonts is not applied as a ParseBlock step.
                parsedText
                    .Validate<RemoveSmallFonts>().ShowErrors(p => p.ShowText(Color.Green));
            });

            pdfPipeline.Done();
        }
示例#15
0
        /// <summary>
        /// For every page of <c>bin/{basename}.pdf</c>, runs <c>PreProcessTables</c> and
        /// <c>IdentifyTables</c>, shows the result in green, and shows <c>CheckOverlap</c>
        /// validation errors in red.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void ShowTables(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var document = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-tables.pdf");

            document.AllPages(page =>
                page.ParsePdf<PreProcessTables>()
                    .ParseBlock<IdentifyTables>()
                    .Show(Color.Green)
                    .Validate<CheckOverlap>().ShowErrors(b => b.Show(Color.Red)));

            pdfPipeline.Done();
        }
示例#16
0
        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, shows the <c>PreProcessTables</c> result in
        /// yellow and the <c>IdentifyTables</c> result in green.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void ShowTables(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-table-output.pdf")
                .Page(1);

            // State after PreProcessTables, shown in yellow.
            var rawTables = firstPage
                .ParsePdf<PreProcessTables>()
                .Show(Color.Yellow);

            // State after IdentifyTables, shown in green.
            rawTables
                .ParseBlock<IdentifyTables>()
                .Show(Color.Green);

            pdfPipeline.Done();
        }
示例#17
0
        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, runs <c>PreProcessImages</c>, shows
        /// <c>RemoveOverlapedImages</c> validation errors in red, applies that step and shows the
        /// result in green.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void RemoveOverlapedImages(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-tmp-output.pdf")
                .Page(1);

            // Disabled table steps, kept for reference:
            //   .ParsePdf<PreProcessTables>()
            //   .ParseBlock<IdentifyTables>()
            var images = firstPage.ParsePdf<PreProcessImages>();

            images
                .Validate<RemoveOverlapedImages>().ShowErrors(p => p.Show(Color.Red))
                .ParseBlock<RemoveOverlapedImages>()
                .Show(Color.Green);

            pdfPipeline.Done();
        }
示例#18
0
        /// <summary>
        /// On page 1 of <c>{_inputFolder}/{basename}.pdf</c>, shows the lines after
        /// <c>GroupLines</c> in green and the result of <c>FindInitialBlockset</c> in orange.
        /// The output path is <c>{pdfsDir}/04-followline-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths.</param>
        public static void FollowLine(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"{_inputFolder}/{basename}.pdf")
                .Output($"{pdfsDir}/04-followline-{basename}-output.pdf")
                .Page(1);

            // Lines after GroupLines, shown in green.
            var groupedLines = firstPage
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<GroupLines>()
                .ShowLine(Color.Green);

            // State after FindInitialBlockset, shown in orange.
            groupedLines
                .ParseBlock<FindInitialBlockset>()
                .Show(Color.Orange);

            pdfPipeline.Done();
        }
示例#19
0
        /// <summary>
        /// Configures the file system with <paramref name="virtualFS"/> and then, for every page
        /// of <c>{basename}.pdf</c>, runs <c>ProcessPdfText</c> and shows the lines in orange.
        /// </summary>
        /// <param name="virtualFS">File-system implementation passed to <c>VirtualFS.ConfigureFileSystem</c> before the pipeline is created.</param>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths.</param>
        public static void FollowText(IVirtualFS virtualFS, string basename)
        {
            // This call comes first, before the pipeline is constructed.
            VirtualFS.ConfigureFileSystem(virtualFS);

            var pdfPipeline = new Execution.Pipeline();

            var document = pdfPipeline
                .Input($"{basename}.pdf")
                .Output($"{basename}-follow-text-output.pdf");

            document.AllPages(page => page.ParsePdf<ProcessPdfText>().ShowLine(Color.Orange));

            pdfPipeline.Done();
        }
示例#20
0
        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, shows the <c>IdentifyTables</c> result in green,
        /// then runs the text steps through <c>RemoveTableText</c> and <c>GroupLines</c> and shows
        /// the result in red.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void RemoveTables(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-table-output.pdf")
                .Page(1);

            // State after IdentifyTables, shown in green.
            var tables = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .Show(Color.Green);

            // State after the text steps, shown in red.
            tables
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>()
                .Show(Color.Red);

            pdfPipeline.Done();
        }
        /// <summary>
        /// Runs the validation steps on page 1 of <c>{inputfolder}/{basename}</c>, showing the
        /// <c>ProcessPdfValidation</c> result in white and the final result in blue. The output
        /// path is <c>{outputfolder}/{basename}-invalid.pdf</c>.
        /// </summary>
        /// <param name="basename">File name used to build the input and output paths.</param>
        /// <param name="inputfolder">Folder that contains the input file.</param>
        /// <param name="outputfolder">Folder used to build the output path.</param>
        public static void ProcessPage1(string basename, string inputfolder, string outputfolder)
        {
            //PdfReaderException.DisableWarnings();
            //PdfReaderException.ContinueOnException();

            var pipeline = new Execution.Pipeline();

            // NOTE(review): no ".pdf" is appended to the input path here, although the output path
            // appends "-invalid.pdf" — presumably callers pass basename with its extension; confirm.
            // The chain's result was previously stored in an unused local; it is now discarded.
            pipeline.Input($"{inputfolder}/{basename}")
                .Output($"{outputfolder}/{basename}-invalid.pdf")
                .Page(1)
                .ParsePdf<ProcessPdfValidation>()
                .Show(Color.White)
                .ParseBlock<IdentifyValidationMarks>()
                .ParseBlock<CheckNoBlockSetOverlap>()
                .Show(Color.Blue);

            pipeline.Done();
        }
示例#22
0
        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, runs the table, image and text steps followed by
        /// <c>AddTableSpace</c> and <c>AddImageSpace</c>, and shows the result in orange.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void AddImageSpace(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-img-output.pdf")
                .Page(1);

            // Table and image steps.
            var afterImages = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<PreProcessImages>();

            // Text steps up to the initial blockset.
            var initialBlocks = afterImages
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>()
                .ParseBlock<FindInitialBlockset>();

            initialBlocks
                .ParseBlock<AddTableSpace>()
                .ParseBlock<AddImageSpace>()
                .Show(Color.Orange);

            pdfPipeline.Done();
        }
示例#23
0
        /// <summary>
        /// On page 1 of <c>{_inputFolder}/{basename}.pdf</c>, runs the table, image and text steps,
        /// validates and applies <c>RemoveHeaderImage</c>, then runs
        /// <c>FindInitialBlocksetWithRewind</c> and <c>BreakColumnsLight</c> and shows the result in
        /// orange. The output path is <c>{pdfsDir}/05-breakcolumn-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths.</param>
        public static void BreakColumn(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"{_inputFolder}/{basename}.pdf")
                .Output($"{pdfsDir}/05-breakcolumn-{basename}-output.pdf")
                .Page(1);

            // Table and image steps.
            var afterImages = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<PreProcessImages>()
                .ParseBlock<RemoveOverlapedImages>();

            // Text steps; the header image is validated (purple) and then removed.
            var afterHeader = afterImages
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<GroupLines>()
                .Validate<RemoveHeaderImage>().ShowErrors(p => p.Show(Color.Purple))
                .ParseBlock<RemoveHeaderImage>();

            afterHeader
                .ParseBlock<FindInitialBlocksetWithRewind>()
                .ParseBlock<BreakColumnsLight>()
                .Show(Color.Orange);

            pdfPipeline.Done();
        }
        /// <summary>
        /// Takes the text lines returned by <c>GetTextLines</c> through the chain of
        /// <c>ConvertText</c> steps, passing <c>{outputfolder}/{basename}-show-central.pdf</c> to
        /// <c>ShowPdf</c> and <c>{outputfolder}/{basename}-tree.txt</c> to <c>Log</c>, then passes
        /// <c>{outputfolder}/errors/{basename}-parser-errors.pdf</c> to <c>ExtractOutput</c>.
        /// </summary>
        /// <param name="basename">Base file name used to build the output paths; also forwarded to <c>GetTextLines</c>.</param>
        /// <param name="inputfolder">Input folder, forwarded to <c>GetTextLines</c>.</param>
        /// <param name="outputfolder">Folder used to build the output paths; also forwarded to <c>GetTextLines</c>.</param>
        public static void Process(string basename, string inputfolder, string outputfolder)
        {
            // Both calls are made before the pipeline is created.
            BasicFirstPageStats.Reset();
            PdfReaderException.ContinueOnException();

            var pipeline = new Execution.Pipeline();

            // The list itself is not used, so it is no longer stored in a local. The ToList()
            // call is kept because it presumably forces the conversion chain to run — TODO confirm.
            GetTextLines(pipeline, basename, inputfolder, outputfolder)
                .ConvertText<CreateTextLineIndex, TextLine>()
                .ConvertText<PreCreateStructures, TextLine2>()
                .ConvertText<CreateStructures2, TextStructure>()
                .ConvertText<PreCreateTextSegments, TextStructureAgg>()
                .ConvertText<AggregateStructures, TextStructure>()
                .ShowPdf<ShowStructureCentral>($"{outputfolder}/{basename}-show-central.pdf")
                .ConvertText<CreateTextSegments, TextSegment>()
                .ConvertText<CreateTreeSegments, TextSegment>()
                .Log<AnalyzeTreeStructure>($"{outputfolder}/{basename}-tree.txt")
                .ToList();

            pipeline.ExtractOutput<ShowParserWarnings>($"{outputfolder}/errors/{basename}-parser-errors.pdf");

            pipeline.Done();
        }
示例#25
0
        //public static void ValidateBreakColumns(string basename)
        //{
        //    var pipeline = new Execution.Pipeline();

        //    pipeline.Input($"bin/{basename}.pdf").Page(1)
        //            .Output($"bin/{basename}-tmp-output.pdf")
        //            .ParsePdf<ProcessPdfText>()
        //            .ParseBlock<GroupLines>()
        //            .ParseBlock<FindInitialBlockset>()
        //            .Validate<BreakColumns>()
        //            .ShowErrors(p => p.Show(Color.Purple));

        //    pipeline.Done();
        //}

        //public static void BreakColumns(string basename)
        //{
        //    var pipeline = new Execution.Pipeline();

        //    pipeline.Input($"bin/{basename}.pdf").Page(1)
        //            .Output($"bin/{basename}-tmp-output.pdf")
        //            .ParsePdf<ProcessPdfText>()
        //            .ParseBlock<GroupLines>()
        //            .ParseBlock<FindInitialBlockset>()
        //                .Validate<BreakColumns>().ShowErrors(p => p.Show(Color.LightGray))
        //                .ParseBlock<BreakColumns>()
        //                .Show(Color.Green)
        //                .Validate<BreakColumns>().ShowErrors(p => p.Show(Color.Red));

        //    pipeline.Done();
        //}

        //public static void RemoveHeaderFooter(string basename)
        //{
        //    var pipeline = new Execution.Pipeline();

        //    pipeline.Input($"bin/{basename}.pdf").Page(1)
        //            .Output($"bin/{basename}-tmp-output.pdf")
        //            .ParsePdf<ProcessPdfText>()
        //            .ParseBlock<GroupLines>()
        //            .ParseBlock<FindInitialBlockset>()
        //            .ParseBlock<BreakColumns>()
        //            .Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Purple))
        //            .Validate<RemoveHeader>().ShowErrors(p => p.Show(Color.Purple))
        //            .ParseBlock<RemoveFooter>()
        //            .ParseBlock<RemoveHeader>()
        //            .Show(Color.Yellow);

        //    pipeline.Done();
        //}

        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, shows the lines after
        /// <c>FindInitialBlockset</c> in gray and the <c>MergeBlockLines</c> result in green, then
        /// validates (purple) and applies <c>RemoveFooter</c> and <c>RemoveHeader</c>.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void MergeBlockLines(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            // Page(1) is called before Output(...) here; that call order is kept as written.
            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Page(1)
                .Output($"bin/{basename}-tmp-output.pdf");

            // Disabled steps after FindInitialBlockset, kept for reference:
            //   .ParseBlock<TestSplitBlocksets>()
            //   .Show(Color.Red)
            var initialBlocks = firstPage
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<GroupLines>()
                .ParseBlock<FindInitialBlockset>()
                .ShowLine(Color.Gray);

            // Disabled step after the merge, kept for reference: .ParseBlock<BreakColumns>()
            var mergedBlocks = initialBlocks
                .ParseBlock<MergeBlockLines>()
                .Show(Color.Green);

            // Both validations (purple) run before either removal step is applied.
            // Disabled trailing step, kept for reference: .Show(Color.Yellow)
            mergedBlocks
                .Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Purple))
                .Validate<RemoveHeader>().ShowErrors(p => p.Show(Color.Purple))
                .ParseBlock<RemoveFooter>()
                .ParseBlock<RemoveHeader>();

            pdfPipeline.Done();
        }
示例#26
0
        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, runs the table, image and text steps, validates
        /// (orange) and applies <c>RemoveFooter</c> and <c>RemoveHeaderImage</c>, and shows the
        /// result in yellow.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void RemoveHeaderImage(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-img-output.pdf")
                .Page(1);

            // Table, image and text steps up to the initial blockset.
            var initialBlocks = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<PreProcessImages>()
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>()
                .ParseBlock<FindInitialBlockset>();

            // Validations, errors in orange.
            // Disabled between the two: .Validate<RemoveHeader>().ShowErrors(p => p.Show(Color.Orange))
            var validated = initialBlocks
                .Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Orange))
                .Validate<RemoveHeaderImage>().ShowErrors(p => p.Show(Color.Orange));

            // Removal steps.
            // Disabled between the two: .ParseBlock<RemoveHeader>()
            validated
                .ParseBlock<RemoveFooter>()
                .ParseBlock<RemoveHeaderImage>()
                .Show(Color.Yellow);

            pdfPipeline.Done();
        }
示例#27
0
        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, shows the state after <c>FindInitialBlockset</c>
        /// in yellow and after <c>DetectImplicitTable</c> in green, then shows
        /// <c>DetectImplicitTable</c> validation errors in red and the lines in black.
        /// </summary>
        /// <param name="basename">Base file name (without the <c>.pdf</c> extension) used to build the input and output paths under <c>bin/</c>.</param>
        public static void DetectInvisibleTable(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            var firstPage = pdfPipeline
                .Input($"bin/{basename}.pdf")
                .Output($"bin/{basename}-table-output.pdf")
                .Page(1);

            // Table and image steps; RemoveOverlapedImages is validated (red) and then applied.
            var afterImages = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<PreProcessImages>()
                .Validate<RemoveOverlapedImages>().ShowErrors(p => p.Show(Color.Red))
                .ParseBlock<RemoveOverlapedImages>();

            // Text steps; the initial blockset is shown in yellow.
            var initialBlocks = afterImages
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>()
                .ParseBlock<FindInitialBlockset>()
                .Show(Color.Yellow);

            // DetectImplicitTable is applied first (green) and validated afterwards (red).
            initialBlocks
                .ParseBlock<DetectImplicitTable>()
                .Show(Color.Green)
                .Validate<DetectImplicitTable>().ShowErrors(p => p.Show(Color.Red))
                .ShowLine(Color.Black);

            pdfPipeline.Done();
        }