Exemple #1
0
        /// <summary>
        /// Builds a pipeline over page 1 of <c>bin/{basename}.pdf</c>, with
        /// <c>bin/{basename}-table-output.pdf</c> set as output, runs the table, text,
        /// column, footer/header and blockset stages listed below, and finally calls
        /// <c>Show</c> and <c>ShowLine</c> on the result.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void ResizeBlocksets(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-table-output.pdf";

            var firstPage = pdfPipeline.Input(inputPath).Output(outputPath).Page(1);

            firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>()
                .ParseBlock<FindInitialBlockset>()
                .ParseBlock<BreakColumns>()
                // Both validations are chained before either removal stage runs.
                .Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                .Validate<RemoveHeader>().ShowErrors(err => err.Show(Color.Purple))
                .ParseBlock<RemoveFooter>()
                .ParseBlock<RemoveHeader>()
                .ParseBlock<AddTableSpace>()
                .ParseBlock<ResizeBlocksets>()
                .ParseBlock<OrderBlocksets>()
                .Show(Color.Orange)
                .ShowLine(Color.Black);

            pdfPipeline.Done();
        }
        /// <summary>
        /// Runs the <c>ProcessPdfValidation</c>, <c>IdentifyValidationMarks</c> and
        /// <c>CheckNoBlockSetOverlap</c> stages over <c>{inputfolder}/{basename}.pdf</c>
        /// (via <c>AllPagesExcept</c> with an empty exclusion array), then calls
        /// <c>SaveOk</c> and <c>SaveErrors</c> with paths under <paramref name="outputfolder"/>.
        /// </summary>
        /// <param name="basename">PDF file name without the extension.</param>
        /// <param name="inputfolder">Folder that contains the input PDF.</param>
        /// <param name="outputfolder">Folder used for the <c>-ok.pdf</c> file and the <c>errors/</c> subfolder.</param>
        /// <returns>The value returned by <c>SaveErrors</c>.</returns>
        public static int Process(string basename, string inputfolder, string outputfolder)
        {
            //PdfReaderException.DisableWarnings();
            //PdfReaderException.ContinueOnException();

            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"{inputfolder}/{basename}.pdf";

            // The resulting list is never read; the ToList() call itself is kept
            // (presumably it forces the pages to be processed - TODO confirm).
            pdfPipeline.Input(inputPath)
                //.Output($"{outputfolder}/{basename}-output.pdf")
                .AllPagesExcept<CreateTextLines>(new int[] { }, pdfPage =>
                    pdfPage.ParsePdf<ProcessPdfValidation>()
                        //.Show(Color.White)
                        .ParseBlock<IdentifyValidationMarks>()
                        .PdfCheck<CheckNoBlockSetOverlap>(Color.Orange)
                        //.Show(Color.Blue)
                    )
                .ToList();

            pdfPipeline.SaveOk($"{outputfolder}/{basename}-ok.pdf");
            int errorCount = pdfPipeline.SaveErrors($"{outputfolder}/errors/{basename}-errors.pdf");

            pdfPipeline.Done();

            return errorCount;
        }
Exemple #3
0
        /// <summary>
        /// Calls <c>ExtractPages</c> on <c>bin/{basename}.pdf</c>, passing
        /// <c>bin/{outputname}.pdf</c> and the given page list.
        /// </summary>
        /// <param name="basename">Input PDF file name under <c>bin/</c>, without the extension.</param>
        /// <param name="outputname">Output PDF file name under <c>bin/</c>, without the extension.</param>
        /// <param name="pages">Page numbers handed to the pipeline's <c>ExtractPages</c> call.</param>
        public static void ExtractPages(string basename, string outputname, IList<int> pages)
        {
            // The pipeline was previously created and then abandoned without any
            // cleanup. Execution.Pipeline is used in a using block elsewhere in this
            // file, so dispose it here too; this also covers the exception path.
            using (var pipeline = new Execution.Pipeline())
            {
                pipeline.Input($"bin/{basename}.pdf")
                    .ExtractPages($"bin/{outputname}.pdf", pages);
            }
        }
Exemple #4
0
        /// <summary>
        /// Builds a pipeline over page 1 of <c>bin/{basename}.pdf</c>, with
        /// <c>bin/{basename}-test-output.pdf</c> set as output, runs the table, image and
        /// text stages listed below, and ends with a <c>ResizeBlocksets</c> validation
        /// whose errors are shown in red.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void ValidateResizeBlock(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-test-output.pdf";

            var firstPage = pdfPipeline.Input(inputPath).Output(outputPath).Page(1);

            // Tables and images.
            var images = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<PreProcessImages>()
                .Validate<RemoveOverlapedImages>().ShowErrors(err => err.Show(Color.Red))
                .ParseBlock<RemoveOverlapedImages>();

            // Text, columns, footer/header and spacing, then the final validation.
            images
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>()
                .ParseBlock<FindInitialBlockset>()
                .ParseBlock<BreakColumns>()
                //.Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Purple))
                //.Validate<RemoveHeaderImage>().ShowErrors(p => p.Show(Color.Purple))
                .ParseBlock<RemoveFooter>()
                .ParseBlock<RemoveHeaderImage>()
                .ParseBlock<AddTableSpace>()
                .ParseBlock<AddImageSpace>()
                //.ParseBlock<BreakInlineElements>()
                .Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Red));

            pdfPipeline.Done();
        }
Exemple #5
0
        /// <summary>
        /// Calls <c>Extract</c> on <c>bin/{basename}.pdf</c>, passing
        /// <c>bin/{basename}-p{page}.pdf</c> and <paramref name="page"/> as both
        /// remaining arguments.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        /// <param name="page">Page number passed twice to <c>Extract</c>.</param>
        public static void ExtractPage(string basename, int page)
        {
            // The pipeline was previously created and then abandoned without any
            // cleanup. Execution.Pipeline is used in a using block elsewhere in this
            // file, so dispose it here too; this also covers the exception path.
            using (var pipeline = new Execution.Pipeline())
            {
                pipeline.Input($"bin/{basename}.pdf")
                    .Extract($"bin/{basename}-p{page}.pdf", page, page);
            }
        }
        /// <summary>
        /// Builds the input path <c>{inputfolder}/{basename}.pdf</c> and the output path
        /// <c>{outputfolder}/{basename}-parser.pdf</c>, then forwards to
        /// <c>Examples.GetTextLines</c> with the supplied pipeline.
        /// </summary>
        /// <returns>Whatever <c>Examples.GetTextLines</c> returns.</returns>
        static PipelineText<TextLine> GetTextLines(Execution.Pipeline pipeline, string basename, string inputfolder, string outputfolder)
        {
            return Examples.GetTextLines(
                pipeline,
                $"{inputfolder}/{basename}.pdf",
                $"{outputfolder}/{basename}-parser.pdf");
        }
Exemple #7
0
        /// <summary>
        /// Builds a pipeline over page 1 of <c>{_inputFolder}/{basename}.pdf</c>, with
        /// <c>{pdfsDir}/09-orders-{basename}-output.pdf</c> set as output, runs the stages
        /// listed below through <c>OrderBlocksets</c>, and calls <c>Show</c> and
        /// <c>ShowLine</c> on the result.
        /// </summary>
        /// <param name="basename">PDF file name without the extension.</param>
        public static void CorrectOrder(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"{_inputFolder}/{basename}.pdf";

            var firstPage = pdfPipeline
                .Input(inputPath)
                .Output($"{pdfsDir}/09-orders-{basename}-output.pdf")
                .Page(1);

            // Tables, images and text up to grouped lines.
            var groupedLines = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .ParsePdf<PreProcessImages>()
                .ParseBlock<RemoveOverlapedImages>()
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveSmallFonts>()
                .ParseBlock<MergeTableText>()
                .ParseBlock<HighlightTextTable>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>();

            // Header/footer handling, blockset sizing and ordering.
            groupedLines
                .Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                .ParseBlock<RemoveHeaderImage>()
                .ParseBlock<FindInitialBlocksetWithRewind>()
                .ParseBlock<BreakColumnsLight>()
                .Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                .ParseBlock<RemoveFooter>()
                .ParseBlock<AddTableSpace>()
                .ParseBlock<AddImageSpace>()
                .ParseBlock<BreakInlineElements>()
                .ParseBlock<ResizeBlocksets>()
                .Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Red))
                .ParseBlock<OrderBlocksets>()
                .Show(Color.Orange)
                .ShowLine(Color.Black);

            pdfPipeline.Done();
        }
Exemple #8
0
        /// <summary>
        /// For every page of <c>bin/{basename}.pdf</c> (via <c>AllPages</c>), runs the
        /// table, image and text stages listed below and ends with the header-image and
        /// footer validation/removal stages. Output is set to
        /// <c>bin/{basename}-header-footer-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void ShowHeaderFooter(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-header-footer-output.pdf";

            pdfPipeline.Input(inputPath)
                .Output(outputPath)
                .AllPages(pdfPage =>
                {
                    // Tables shown in red, images in green.
                    var images = pdfPage
                        .ParsePdf<PreProcessTables>()
                        .ParseBlock<IdentifyTables>()
                        .Show(Color.Red)
                        .ParsePdf<PreProcessImages>()
                        .Show(Color.Green);

                    var groupedLines = images
                        .ParseBlock<RemoveOverlapedImages>()
                        .ParsePdf<ProcessPdfText>()
                        .ParseBlock<RemoveSmallFonts>()
                        .ParseBlock<MergeTableText>()
                        .ParseBlock<HighlightTextTable>()
                        .ParseBlock<RemoveTableText>()
                        .ParseBlock<GroupLines>();

                    groupedLines
                        .Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                        .ParseBlock<RemoveHeaderImage>()
                        .ParseBlock<FindInitialBlocksetWithRewind>()
                        .ParseBlock<BreakColumnsLight>()
                        //.ParseBlock<BreakColumns>()
                        .Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                        .ParseBlock<RemoveFooter>();
                });

            pdfPipeline.Done();
        }
Exemple #9
0
 /// <summary>
 /// Calls <c>ExtractPages</c> on <c>{basename}.pdf</c>, passing <c>{outputname}.pdf</c>
 /// and the given page list; the pipeline is disposed when the call finishes.
 /// </summary>
 static void ExtractPages2(string basename, string outputname, IList<int> pages)
 {
     using (var pdfPipeline = new Execution.Pipeline())
     {
         string inputPath = $"{basename}.pdf";
         string outputPath = $"{outputname}.pdf";

         pdfPipeline.Input(inputPath).ExtractPages(outputPath, pages);
     }
 }
Exemple #10
0
        /// <summary>
        /// Applies <c>ProcessPage</c> to every page of <c>bin/{basename}.pdf</c> (via
        /// <c>AllPages</c>), with <c>bin/{basename}-page-output.pdf</c> set as output.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void MultipageCore(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-page-output.pdf";

            var document = pdfPipeline.Input(inputPath).Output(outputPath);
            document.AllPages(pdfPage => ProcessPage(pdfPage));

            pdfPipeline.Done();
        }
Exemple #11
0
        /// <summary>
        /// Returns the result of <c>StreamConvert&lt;CreateTextLines&gt;(ProcessPage)</c> for
        /// <c>bin/{basename}.pdf</c>, with <c>bin/{basename}-test-output.pdf</c> set as output.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the pipeline created here is neither finished nor handed back
        /// to the caller - confirm whether that is intentional for the streamed result.
        /// </remarks>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static IEnumerable<TextLine> GetEnumerableLines(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-test-output.pdf";

            return pdfPipeline
                .Input(inputPath)
                .Output(outputPath)
                .StreamConvert<CreateTextLines>(ProcessPage);
        }
Exemple #12
0
        /// <summary>
        /// Creates a new pipeline, returns it through <paramref name="pipeline"/>, and
        /// returns the result of <c>AllPages&lt;CreateTextLines&gt;(ProcessPage)</c> for
        /// <c>bin/{basename}.pdf</c> (output set to <c>bin/{basename}-test-output.pdf</c>).
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        /// <param name="pipeline">Receives the newly created pipeline.</param>
        public static PipelineText<TextLine> GetTextLinesWithPipeline(string basename, out Execution.Pipeline pipeline)
        {
            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-test-output.pdf";

            pipeline = new Execution.Pipeline();

            var document = pipeline.Input(inputPath).Output(outputPath);

            return document.AllPages<CreateTextLines>(ProcessPage);
        }
Exemple #13
0
        /// <summary>
        /// Runs the full numbered stage sequence below on every page of
        /// <paramref name="inputname"/> (via <c>AllPages&lt;CreateTextLines&gt;</c>), with
        /// <paramref name="outputname"/> set as output, using the caller's pipeline.
        /// </summary>
        /// <param name="pipeline">Pipeline to run on; it is not finished by this method.</param>
        /// <param name="inputname">Path passed to <c>Input</c>.</param>
        /// <param name="outputname">Path passed to <c>Output</c>.</param>
        /// <returns>The value returned by <c>AllPages&lt;CreateTextLines&gt;</c>.</returns>
        public static PipelineText<TextLine> GetTextLines(Execution.Pipeline pipeline, string inputname, string outputname)
        {
            var document = pipeline.Input(inputname).Output(outputname);

            // The trailing numbers count the ParseBlock/ParsePdf stages in order.
            return document.AllPages<CreateTextLines>(pdfPage =>
                pdfPage.ParsePdf<PreProcessTables>()
                    .ParseBlock<IdentifyTables>()                 // 1
                    .ParsePdf<PreProcessImages>()
                    .ParseBlock<BasicFirstPageStats>()            // 2
                    .ParseBlock<RemoveOverlapedImages>()          // 3
                    .ParsePdf<ProcessPdfText>()                   // 4
                    .ParseBlock<RemoveSmallFonts>()               // 5
                    .ParseBlock<MergeTableText>()                 // 6
                    .ParseBlock<HighlightTextTable>()             // 7
                    .ParseBlock<RemoveTableText>()                // 8
                    .ParseBlock<ReplaceCharacters>()              // 9
                    .ParseBlock<GroupLines>()                     // 10
                    .ParseBlock<RemoveTableDotChar>()             // 11
                    .Show(Color.Yellow)
                    .Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                    .ParseBlock<RemoveHeaderImage>()              // 12
                    .ParseBlock<FindInitialBlocksetWithRewind>()  // 13
                    .Show(Color.Gray)
                    .ParseBlock<BreakColumnsLight>()              // 14
                    .ParseBlock<AddTableSpace>()                  // 15
                    .ParseBlock<RemoveTableOverImage>()           // 16
                    .ParseBlock<RemoveImageTexts>()               // 17
                    .ParseBlock<AddImageSpace>()                  // 18
                    .Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                    .ParseBlock<RemoveFooter>()                   // 19
                    .ParseBlock<AddTableHorizontalLines>()        // 20
                    .ParseBlock<RemoveBackgroundNonText>()        // 21
                    .ParseBlock<BreakColumnsRewrite>()            // 22

                    .ParseBlock<BreakInlineElements>()            // 23
                    .ParseBlock<ResizeBlocksets>()                // 24
                    .ParseBlock<ResizeBlocksetMagins>()           // 25
                    .ParseBlock<OrderBlocksets>()                 // 26

                    .ParseBlock<OrganizePageLayout>()             // 27
                    .ParseBlock<MergeSequentialLayout>()          // 28
                    .ParseBlock<ResizeSequentialLayout>()         // 29
                    .Show(Color.Orange)
                    .ShowLine(Color.Black)
                    .ParseBlock<CheckOverlap>()                   // 30
                    .Validate<CheckOverlap>().ShowErrors(err => err.Show(Color.Red))
                    .Validate<ValidatePositiveCoordinates>().ShowErrors(err => err.Show(Color.Red))

                    .PrintWarnings()
                );
        }
Exemple #14
0
        /// <summary>
        /// Runs <c>ProcessPdfText</c> on page 1 of <c>bin/{basename}.pdf</c> and calls
        /// <c>Show(Color.Yellow)</c> on the result; output is set to
        /// <c>bin/{basename}-tmp-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void MarkAllComponents(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-tmp-output.pdf";

            var firstPage = pdfPipeline.Input(inputPath).Output(outputPath).Page(1);
            firstPage.ParsePdf<ProcessPdfText>().Show(Color.Yellow);

            pdfPipeline.Done();
        }
Exemple #15
0
        /// <summary>
        /// Runs <c>ProcessPdfText</c> on page 1 of <c>{_inputFolder}/{basename}.pdf</c> and
        /// calls <c>Show(Color.Orange)</c> on the result; output is set to
        /// <c>{pdfsDir}/01-blocks-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name without the extension.</param>
        public static void Blocks(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"{_inputFolder}/{basename}.pdf";

            var firstPage = pdfPipeline
                .Input(inputPath)
                .Output($"{pdfsDir}/01-blocks-{basename}-output.pdf")
                .Page(1);

            firstPage.ParsePdf<ProcessPdfText>().Show(Color.Orange);

            pdfPipeline.Done();
        }
Exemple #16
0
        /// <summary>
        /// Runs <c>PreProcessImages</c> on page 1 of <c>bin/{basename}.pdf</c> and calls
        /// <c>Show(Color.Red)</c> on the result; output is set to
        /// <c>bin/{basename}-img-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void ProcessImages(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-img-output.pdf";

            var firstPage = pdfPipeline.Input(inputPath).Output(outputPath).Page(1);
            firstPage.ParsePdf<PreProcessImages>().Show(Color.Red);

            pdfPipeline.Done();
        }
Exemple #17
0
        /// <summary>
        /// Runs <c>AllPages&lt;CreateTextLines&gt;(ProcessPage)</c> on <c>bin/{basename}.pdf</c>
        /// (output set to <c>bin/{basename}-test-output.pdf</c>) and converts the result
        /// with <c>ConvertText&lt;CreateStructures, TextStructure&gt;</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        /// <returns>The value returned by <c>ConvertText</c>.</returns>
        public static PipelineText<TextStructure> GetTextParagraphs(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-test-output.pdf";

            var textLines = pdfPipeline
                .Input(inputPath)
                .Output(outputPath)
                .AllPages<CreateTextLines>(ProcessPage);

            return textLines.ConvertText<CreateStructures, TextStructure>();
        }
Exemple #18
0
        /// <summary>
        /// Runs <c>PreProcessRenderPath</c> on page 1 of <c>bin/{basename}.pdf</c> and calls
        /// <c>ShowLine(Color.Green)</c> on the result; output is set to
        /// <c>bin/{basename}-tmp-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void ShowRenderPath(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-tmp-output.pdf";

            var firstPage = pdfPipeline.Input(inputPath).Output(outputPath).Page(1);
            firstPage.ParsePdf<PreProcessRenderPath>().ShowLine(Color.Green);

            pdfPipeline.Done();
        }
Exemple #19
0
        /// <summary>
        /// Runs <c>ProcessPdfText</c> and <c>GroupLines</c> on page 1 of
        /// <c>bin/{basename}.pdf</c> and calls <c>Show(Color.Orange)</c> on the result;
        /// output is set to <c>bin/{basename}-tmp-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void GroupLines(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-tmp-output.pdf";

            // NOTE(review): Page(1) is called before Output(...) here, unlike the other
            // examples in this file - the original order is kept as-is; confirm intent.
            var firstPage = pdfPipeline.Input(inputPath).Page(1);

            firstPage
                .Output(outputPath)
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<GroupLines>()
                .Show(Color.Orange);

            pdfPipeline.Done();
        }
Exemple #20
0
        /// <summary>
        /// For every page of <c>bin/{basename}.pdf</c> (via <c>AllPages</c>), runs
        /// <c>ProcessPdfText</c> and calls <c>ShowLine(Color.Orange)</c>; output is set to
        /// <c>bin/{basename}-follow-text-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void FollowText(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-follow-text-output.pdf";

            var document = pdfPipeline.Input(inputPath).Output(outputPath);

            document.AllPages(pdfPage => pdfPage.ParsePdf<ProcessPdfText>().ShowLine(Color.Orange));

            pdfPipeline.Done();
        }
Exemple #21
0
        /// <summary>
        /// Runs <c>ProcessPdfText</c> and <c>GroupLines</c> on page 1 of
        /// <c>{_inputFolder}/{basename}.pdf</c> and calls <c>Show(Color.Red)</c> on the
        /// result; output is set to <c>{pdfsDir}/02-blockline-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name without the extension.</param>
        public static void BlockLines(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"{_inputFolder}/{basename}.pdf";

            var firstPage = pdfPipeline
                .Input(inputPath)
                .Output($"{pdfsDir}/02-blockline-{basename}-output.pdf")
                .Page(1);

            firstPage
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<GroupLines>()
                .Show(Color.Red);

            pdfPipeline.Done();
        }
Exemple #22
0
        /// <summary>
        /// For every page of <c>{_inputFolder}/{basename}.pdf</c> (via <c>AllPages</c>),
        /// runs <c>ProcessPdfText</c> and a <c>RemoveSmallFonts</c> validation whose
        /// errors are passed to <c>ShowText(Color.Green)</c>; output is set to
        /// <c>{pdfsDir}/10-ids-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name without the extension.</param>
        public static void FindIds(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"{_inputFolder}/{basename}.pdf";

            pdfPipeline.Input(inputPath)
                .Output($"{pdfsDir}/10-ids-{basename}-output.pdf")
                .AllPages(pdfPage =>
                {
                    var text = pdfPage.ParsePdf<ProcessPdfText>();
                    text.Validate<RemoveSmallFonts>().ShowErrors(err => err.ShowText(Color.Green));
                });

            pdfPipeline.Done();
        }
Exemple #23
0
        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, runs <c>PreProcessTables</c> followed by
        /// <c>Show(Color.Yellow)</c>, then <c>IdentifyTables</c> followed by
        /// <c>Show(Color.Green)</c>; output is set to <c>bin/{basename}-table-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void ShowTables(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-table-output.pdf";

            var firstPage = pdfPipeline.Input(inputPath).Output(outputPath).Page(1);

            var rawTables = firstPage.ParsePdf<PreProcessTables>().Show(Color.Yellow);
            rawTables.ParseBlock<IdentifyTables>().Show(Color.Green);

            pdfPipeline.Done();
        }
Exemple #24
0
        /// <summary>
        /// For every page of <c>bin/{basename}.pdf</c> (via <c>AllPages</c>), runs
        /// <c>PreProcessTables</c> and <c>IdentifyTables</c>, calls <c>Show(Color.Green)</c>,
        /// and shows <c>CheckOverlap</c> validation errors in red; output is set to
        /// <c>bin/{basename}-tables.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void ShowTables(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-tables.pdf";

            var document = pdfPipeline.Input(inputPath).Output(outputPath);

            document.AllPages(pdfPage =>
                pdfPage.ParsePdf<PreProcessTables>()
                    .ParseBlock<IdentifyTables>()
                    .Show(Color.Green)
                    .Validate<CheckOverlap>().ShowErrors(err => err.Show(Color.Red)));

            pdfPipeline.Done();
        }
Exemple #25
0
        /// <summary>
        /// Calls <c>VirtualFS.ConfigureFileSystem</c> with <paramref name="virtualFS"/>,
        /// then, for every page of <c>{basename}.pdf</c> (via <c>AllPages</c>), runs
        /// <c>ProcessPdfText</c> and calls <c>ShowLine(Color.Orange)</c>; output is set to
        /// <c>{basename}-follow-text-output.pdf</c>.
        /// </summary>
        /// <param name="virtualFS">File system passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
        /// <param name="basename">PDF file name without the extension.</param>
        public static void FollowText(IVirtualFS virtualFS, string basename)
        {
            // The file system is configured before the pipeline is created.
            VirtualFS.ConfigureFileSystem(virtualFS);

            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"{basename}.pdf";
            string outputPath = $"{basename}-follow-text-output.pdf";

            var document = pdfPipeline.Input(inputPath).Output(outputPath);

            document.AllPages(pdfPage => pdfPage.ParsePdf<ProcessPdfText>().ShowLine(Color.Orange));

            pdfPipeline.Done();
        }
Exemple #26
0
        /// <summary>
        /// On page 1 of <c>{_inputFolder}/{basename}.pdf</c>, runs <c>ProcessPdfText</c> and
        /// <c>GroupLines</c>, calls <c>ShowLine(Color.Green)</c>, then runs
        /// <c>FindInitialBlockset</c> and calls <c>Show(Color.Orange)</c>; output is set to
        /// <c>{pdfsDir}/04-followline-{basename}-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name without the extension.</param>
        public static void FollowLine(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"{_inputFolder}/{basename}.pdf";

            var firstPage = pdfPipeline
                .Input(inputPath)
                .Output($"{pdfsDir}/04-followline-{basename}-output.pdf")
                .Page(1);

            var groupedLines = firstPage
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<GroupLines>()
                .ShowLine(Color.Green);

            groupedLines.ParseBlock<FindInitialBlockset>().Show(Color.Orange);

            pdfPipeline.Done();
        }
Exemple #27
0
        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, runs <c>PreProcessImages</c>, shows
        /// <c>RemoveOverlapedImages</c> validation errors in red, runs the
        /// <c>RemoveOverlapedImages</c> stage and calls <c>Show(Color.Green)</c>; output is
        /// set to <c>bin/{basename}-tmp-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void RemoveOverlapedImages(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-tmp-output.pdf";

            var firstPage = pdfPipeline.Input(inputPath).Output(outputPath).Page(1);

            firstPage
                //.ParsePdf<PreProcessTables>()
                //    .ParseBlock<IdentifyTables>()
                .ParsePdf<PreProcessImages>()
                .Validate<RemoveOverlapedImages>().ShowErrors(err => err.Show(Color.Red))
                .ParseBlock<RemoveOverlapedImages>()
                .Show(Color.Green);

            pdfPipeline.Done();
        }
Exemple #28
0
        /// <summary>
        /// Creates a new pipeline, returns it through <paramref name="pipeline"/>, and runs
        /// the stage sequence below over <c>{inputfolder}/{basename}.pdf</c> (via
        /// <c>AllPagesExcept&lt;CreateTextLines&gt;</c> with an empty exclusion array), with
        /// <c>{outputfolder}/{basename}-output.pdf</c> set as output.
        /// </summary>
        /// <param name="basename">PDF file name without the extension.</param>
        /// <param name="inputfolder">Folder that contains the input PDF.</param>
        /// <param name="outputfolder">Folder used for the output PDF path.</param>
        /// <param name="pipeline">Receives the newly created pipeline.</param>
        /// <returns>The value returned by <c>AllPagesExcept&lt;CreateTextLines&gt;</c>.</returns>
        static PipelineText<TextLine> GetTextLines(string basename, string inputfolder, string outputfolder, out Execution.Pipeline pipeline)
        {
            string inputPath = $"{inputfolder}/{basename}.pdf";
            string outputPath = $"{outputfolder}/{basename}-output.pdf";

            pipeline = new Execution.Pipeline();

            var document = pipeline.Input(inputPath).Output(outputPath);

            return document.AllPagesExcept<CreateTextLines>(new int[] { }, pdfPage =>
                pdfPage.ParsePdf<PreProcessTables>()
                    .ParseBlock<IdentifyTables>()
                    .ParsePdf<PreProcessImages>()
                    .ParseBlock<RemoveOverlapedImages>()
                    .ParsePdf<ProcessPdfText>()
                    .Validate<RemoveSmallFonts>().ShowErrors(err => err.ShowText(Color.Green))
                    .ParseBlock<RemoveSmallFonts>()
                    .ParseBlock<MergeTableText>()
                    .ParseBlock<HighlightTextTable>()
                    .ParseBlock<RemoveTableText>()
                    .ParseBlock<ReplaceCharacters>()
                    .ParseBlock<GroupLines>()
                    .ParseBlock<RemoveTableDotChar>()
                    .Show(Color.Yellow)
                    .Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                    .ParseBlock<RemoveHeaderImage>()
                    .ParseBlock<FindInitialBlocksetWithRewind>()
                    .Show(Color.Gray)
                    .ParseBlock<BreakColumnsLight>()
                    //.ParseBlock<BreakColumns>()
                    .ParseBlock<AddTableSpace>()
                    .ParseBlock<RemoveTableOverImage>()
                    .ParseBlock<RemoveImageTexts>()
                    .ParseBlock<AddImageSpace>()
                    .Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                    .ParseBlock<RemoveFooter>()
                    .ParseBlock<BreakInlineElements>()
                    .ParseBlock<ResizeBlocksets>()
                    .Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Gray))
                    .ParseBlock<OrderBlocksets>()
                    .Show(Color.Orange)
                    .ShowLine(Color.Black)
                    .ParseBlock<OrganizePageLayout>()
                    .ParseBlock<CheckOverlap>()
                    .Validate<ValidatePositiveCoordinates>().ShowErrors(err => err.Show(Color.Red))
                );
        }
Exemple #29
0
        /// <summary>
        /// On page 1 of <c>bin/{basename}.pdf</c>, runs <c>PreProcessTables</c> and
        /// <c>IdentifyTables</c>, calls <c>Show(Color.Green)</c>, then runs
        /// <c>ProcessPdfText</c>, <c>RemoveTableText</c> and <c>GroupLines</c> and calls
        /// <c>Show(Color.Red)</c>; output is set to <c>bin/{basename}-table-output.pdf</c>.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        public static void RemoveTables(string basename)
        {
            var pdfPipeline = new Execution.Pipeline();

            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-table-output.pdf";

            var firstPage = pdfPipeline.Input(inputPath).Output(outputPath).Page(1);

            var tables = firstPage
                .ParsePdf<PreProcessTables>()
                .ParseBlock<IdentifyTables>()
                .Show(Color.Green);

            tables
                .ParsePdf<ProcessPdfText>()
                .ParseBlock<RemoveTableText>()
                .ParseBlock<GroupLines>()
                .Show(Color.Red);

            pdfPipeline.Done();
        }
Exemple #30
0
        /// <summary>
        /// Creates a new pipeline, returns it through <paramref name="pipeline"/>, and runs
        /// the stage sequence below over <c>bin/{basename}.pdf</c> (via
        /// <c>AllPagesExcept&lt;CreateTextLines&gt;</c> with an empty exclusion array), with
        /// <c>bin/{basename}-test-output.pdf</c> set as output.
        /// </summary>
        /// <param name="basename">PDF file name under <c>bin/</c>, without the extension.</param>
        /// <param name="pipeline">Receives the newly created pipeline.</param>
        /// <returns>The value returned by <c>AllPagesExcept&lt;CreateTextLines&gt;</c>.</returns>
        static PipelineText<TextLine> GetTextLinesWithPipelineBlockset(string basename, out Execution.Pipeline pipeline)
        {
            string inputPath = $"bin/{basename}.pdf";
            string outputPath = $"bin/{basename}-test-output.pdf";

            pipeline = new Execution.Pipeline();

            var document = pipeline.Input(inputPath).Output(outputPath);

            return document.AllPagesExcept<CreateTextLines>(new int[] { }, pdfPage =>
                pdfPage.ParsePdf<PreProcessTables>()
                    .ParseBlock<IdentifyTables>()
                    .ParsePdf<PreProcessImages>()
                    .Validate<RemoveOverlapedImages>().ShowErrors(err => err.Show(Color.Red))
                    .ParseBlock<RemoveOverlapedImages>()
                    .ParsePdf<ProcessPdfText>()
                    .Validate<RemoveSmallFonts>().ShowErrors(err => err.ShowText(Color.Green))
                    .ParseBlock<RemoveSmallFonts>()
                    //.Validate<MergeTableText>().ShowErrors(p => p.Show(Color.Blue))
                    .ParseBlock<MergeTableText>()
                    //.Validate<HighlightTextTable>().ShowErrors(p => p.Show(Color.Green))
                    .ParseBlock<HighlightTextTable>()
                    .ParseBlock<RemoveTableText>()
                    .ParseBlock<GroupLines>()
                    .Show(Color.Yellow)
                    .Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                    .ParseBlock<RemoveHeaderImage>()

                    .ParseBlock<FindInitialBlocksetWithRewind>()
                    .Show(Color.Gray)
                    .ParseBlock<BreakColumnsLight>()
                    //.ParseBlock<BreakColumns>()
                    .Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                    .ParseBlock<RemoveFooter>()
                    .ParseBlock<AddTableSpace>()
                    .ParseBlock<AddImageSpace>()
                    .ParseBlock<BreakInlineElements>()
                    .ParseBlock<ResizeBlocksets>()
                    .Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Red))
                    .ParseBlock<OrderBlocksets>()
                    .Show(Color.Orange)
                    .ShowLine(Color.Black)
                );
        }