/// <summary>
/// Builds a pipeline over page 1 of <c>bin/{basename}.pdf</c>, with
/// <c>bin/{basename}-table-output.pdf</c> configured as output, and runs the table,
/// text, column, footer/header, resize and ordering stages before calling <c>Done()</c>.
/// </summary>
/// <param name="basename">File name used to build the input and output paths (no folder, no extension).</param>
public static void ResizeBlocksets(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"bin/{basename}.pdf")
                            .Output($"bin/{basename}-table-output.pdf")
                            .Page(1);

    // Table stages.
    var tables = firstPage.ParsePdf<PreProcessTables>()
                          .ParseBlock<IdentifyTables>();

    // Text stages up to column breaking.
    var columns = tables.ParsePdf<ProcessPdfText>()
                        .ParseBlock<RemoveTableText>()
                        .ParseBlock<GroupLines>()
                        .ParseBlock<FindInitialBlockset>()
                        .ParseBlock<BreakColumns>();

    // Both validations run before either removal stage, as in the original chain.
    var trimmed = columns.Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                         .Validate<RemoveHeader>().ShowErrors(err => err.Show(Color.Purple))
                         .ParseBlock<RemoveFooter>()
                         .ParseBlock<RemoveHeader>();

    trimmed.ParseBlock<AddTableSpace>()
           .ParseBlock<ResizeBlocksets>()
           .ParseBlock<OrderBlocksets>()
           .Show(Color.Orange)
           .ShowLine(Color.Black);

    pipeline.Done();
}
/// <summary>
/// Runs the validation stages (<c>ProcessPdfValidation</c>, <c>IdentifyValidationMarks</c>,
/// <c>CheckNoBlockSetOverlap</c>) over <c>{inputfolder}/{basename}.pdf</c>, then calls
/// <c>SaveOk</c>, <c>SaveErrors</c> and <c>Done</c> on the pipeline.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
/// <param name="inputfolder">Folder prefix of the input PDF path.</param>
/// <param name="outputfolder">Folder prefix of the <c>-ok.pdf</c> and <c>errors/…-errors.pdf</c> paths.</param>
/// <returns>The value returned by <c>pipeline.SaveErrors</c>.</returns>
public static int Process(string basename, string inputfolder, string outputfolder)
{
    // Optional switches, left disabled:
    //PdfReaderException.DisableWarnings();
    //PdfReaderException.ContinueOnException();

    string inputPath = $"{inputfolder}/{basename}.pdf";
    string okPath = $"{outputfolder}/{basename}-ok.pdf";
    string errorsPath = $"{outputfolder}/errors/{basename}-errors.pdf";

    var pipeline = new Execution.Pipeline();

    // The returned list is not used; the ToList() call itself is retained.
    pipeline.Input(inputPath)
            //.Output($"{outputfolder}/{basename}-output.pdf")
            .AllPagesExcept<CreateTextLines>(new int[0], pdfPage =>
                pdfPage.ParsePdf<ProcessPdfValidation>()
                       //.Show(Color.White)
                       .ParseBlock<IdentifyValidationMarks>()
                       .PdfCheck<CheckNoBlockSetOverlap>(Color.Orange)
                       //.Show(Color.Blue)
            )
            .ToList();

    pipeline.SaveOk(okPath);
    int errorCount = pipeline.SaveErrors(errorsPath);
    pipeline.Done();

    return errorCount;
}
/// <summary>
/// Calls <c>ExtractPages</c> on <c>bin/{basename}.pdf</c>, passing
/// <c>bin/{outputname}.pdf</c> and the requested page list.
/// </summary>
/// <param name="basename">Input file name without folder or extension.</param>
/// <param name="outputname">Output file name without folder or extension.</param>
/// <param name="pages">Page numbers forwarded unchanged to <c>ExtractPages</c>.</param>
public static void ExtractPages(string basename, string outputname, IList<int> pages)
{
    // The pipeline was previously created and then abandoned without Done()/Dispose().
    // The using block releases it when the block exits, including on exceptions.
    using (var pipeline = new Execution.Pipeline())
    {
        pipeline.Input($"bin/{basename}.pdf")
                .ExtractPages($"bin/{outputname}.pdf", pages);
    }
}
/// <summary>
/// Prepares page 1 of <c>bin/{basename}.pdf</c> (tables, images, text, columns,
/// footer/header removal, table/image spacing) and then runs
/// <c>Validate&lt;ResizeBlocksets&gt;</c>, passing its errors to <c>Show(Color.Red)</c>.
/// Output is configured as <c>bin/{basename}-test-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void ValidateResizeBlock(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"bin/{basename}.pdf")
                            .Output($"bin/{basename}-test-output.pdf")
                            .Page(1);

    var tables = firstPage.ParsePdf<PreProcessTables>()
                          .ParseBlock<IdentifyTables>();

    var images = tables.ParsePdf<PreProcessImages>()
                       .Validate<RemoveOverlapedImages>().ShowErrors(err => err.Show(Color.Red))
                       .ParseBlock<RemoveOverlapedImages>();

    var columns = images.ParsePdf<ProcessPdfText>()
                        .ParseBlock<RemoveTableText>()
                        .ParseBlock<GroupLines>()
                        .ParseBlock<FindInitialBlockset>()
                        .ParseBlock<BreakColumns>();

    // Footer/header validations are disabled here:
    //.Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Purple))
    //.Validate<RemoveHeaderImage>().ShowErrors(p => p.Show(Color.Purple))
    var spaced = columns.ParseBlock<RemoveFooter>()
                        .ParseBlock<RemoveHeaderImage>()
                        .ParseBlock<AddTableSpace>()
                        .ParseBlock<AddImageSpace>();

    // Disabled stage:
    //.ParseBlock<BreakInlineElements>()
    spaced.Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Red));

    pipeline.Done();
}
/// <summary>
/// Calls <c>Extract</c> on <c>bin/{basename}.pdf</c> with the target
/// <c>bin/{basename}-p{page}.pdf</c>, passing <paramref name="page"/> as both
/// the second and third argument.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
/// <param name="page">Page number passed twice to <c>Extract</c>.</param>
public static void ExtractPage(string basename, int page)
{
    // The pipeline was previously created and then abandoned without Done()/Dispose().
    // The using block releases it when the block exits, including on exceptions.
    using (var pipeline = new Execution.Pipeline())
    {
        pipeline.Input($"bin/{basename}.pdf")
                .Extract($"bin/{basename}-p{page}.pdf", page, page);
    }
}
/// <summary>
/// Builds the input path <c>{inputfolder}/{basename}.pdf</c> and the output path
/// <c>{outputfolder}/{basename}-parser.pdf</c>, then delegates to
/// <c>Examples.GetTextLines</c> with the supplied pipeline.
/// </summary>
/// <param name="pipeline">Pipeline instance forwarded unchanged.</param>
/// <param name="basename">File name without folder or extension.</param>
/// <param name="inputfolder">Folder prefix of the input path.</param>
/// <param name="outputfolder">Folder prefix of the output path.</param>
/// <returns>Whatever <c>Examples.GetTextLines</c> returns.</returns>
static PipelineText<TextLine> GetTextLines(Execution.Pipeline pipeline, string basename, string inputfolder, string outputfolder)
    => Examples.GetTextLines(
        pipeline,
        $"{inputfolder}/{basename}.pdf",
        $"{outputfolder}/{basename}-parser.pdf");
/// <summary>
/// Runs the table, image, text, header/footer, spacing, resize and ordering stages
/// on page 1 of <c>{_inputFolder}/{basename}.pdf</c>, with
/// <c>{pdfsDir}/09-orders-{basename}-output.pdf</c> configured as output.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void CorrectOrder(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"{_inputFolder}/{basename}.pdf")
                            .Output($"{pdfsDir}/09-orders-{basename}-output.pdf")
                            .Page(1);

    var tables = firstPage.ParsePdf<PreProcessTables>()
                          .ParseBlock<IdentifyTables>();

    var images = tables.ParsePdf<PreProcessImages>()
                       .ParseBlock<RemoveOverlapedImages>();

    var lines = images.ParsePdf<ProcessPdfText>()
                      .ParseBlock<RemoveSmallFonts>()
                      .ParseBlock<MergeTableText>()
                      .ParseBlock<HighlightTextTable>()
                      .ParseBlock<RemoveTableText>()
                      .ParseBlock<GroupLines>();

    var columns = lines.Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                       .ParseBlock<RemoveHeaderImage>()
                       .ParseBlock<FindInitialBlocksetWithRewind>()
                       .ParseBlock<BreakColumnsLight>();

    var spaced = columns.Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                        .ParseBlock<RemoveFooter>()
                        .ParseBlock<AddTableSpace>()
                        .ParseBlock<AddImageSpace>()
                        .ParseBlock<BreakInlineElements>();

    // Here the resize stage is applied first and validated afterwards.
    spaced.ParseBlock<ResizeBlocksets>()
          .Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Red))
          .ParseBlock<OrderBlocksets>()
          .Show(Color.Orange)
          .ShowLine(Color.Black);

    pipeline.Done();
}
/// <summary>
/// For every page of <c>bin/{basename}.pdf</c>, runs the table, image and text stages
/// followed by header-image and footer validation/removal. Output is configured as
/// <c>bin/{basename}-header-footer-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void ShowHeaderFooter(string basename)
{
    var pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"bin/{basename}.pdf")
                           .Output($"bin/{basename}-header-footer-output.pdf");

    document.AllPages(pdfPage =>
    {
        var tables = pdfPage.ParsePdf<PreProcessTables>()
                            .ParseBlock<IdentifyTables>()
                            .Show(Color.Red);

        var images = tables.ParsePdf<PreProcessImages>()
                           .Show(Color.Green)
                           .ParseBlock<RemoveOverlapedImages>();

        var lines = images.ParsePdf<ProcessPdfText>()
                          .ParseBlock<RemoveSmallFonts>()
                          .ParseBlock<MergeTableText>()
                          .ParseBlock<HighlightTextTable>()
                          .ParseBlock<RemoveTableText>()
                          .ParseBlock<GroupLines>();

        var columns = lines.Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                           .ParseBlock<RemoveHeaderImage>()
                           .ParseBlock<FindInitialBlocksetWithRewind>()
                           .ParseBlock<BreakColumnsLight>();

        // Disabled alternative column stage:
        //.ParseBlock<BreakColumns>()
        columns.Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
               .ParseBlock<RemoveFooter>();
    });

    pipeline.Done();
}
/// <summary>
/// Calls <c>ExtractPages</c> on <c>{basename}.pdf</c> with the target
/// <c>{outputname}.pdf</c>; the pipeline is disposed when the using block exits.
/// Unlike the <c>bin/</c>-prefixed examples, paths here carry no folder prefix.
/// </summary>
/// <param name="basename">Input path without the <c>.pdf</c> extension.</param>
/// <param name="outputname">Output path without the <c>.pdf</c> extension.</param>
/// <param name="pages">Page numbers forwarded unchanged to <c>ExtractPages</c>.</param>
static void ExtractPages2(string basename, string outputname, IList<int> pages)
{
    using (var pipeline = new Execution.Pipeline())
    {
        var source = pipeline.Input($"{basename}.pdf");
        source.ExtractPages($"{outputname}.pdf", pages);
    }
}
/// <summary>
/// Invokes <c>ProcessPage</c> for every page of <c>bin/{basename}.pdf</c>, with
/// <c>bin/{basename}-page-output.pdf</c> configured as output, then calls <c>Done()</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void MultipageCore(string basename)
{
    var pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"bin/{basename}.pdf")
                           .Output($"bin/{basename}-page-output.pdf");

    // Kept as a lambda rather than a method group, exactly as the original passes it.
    document.AllPages(pdfPage => ProcessPage(pdfPage));

    pipeline.Done();
}
/// <summary>
/// Returns the result of <c>StreamConvert&lt;CreateTextLines&gt;(ProcessPage)</c> over
/// <c>bin/{basename}.pdf</c>, with <c>bin/{basename}-test-output.pdf</c> configured as output.
/// </summary>
/// <remarks>
/// The pipeline created here is neither returned nor closed by this method.
/// NOTE(review): presumably the returned sequence still depends on it — confirm before
/// adding disposal.
/// </remarks>
/// <param name="basename">File name without folder or extension.</param>
/// <returns>The sequence produced by <c>StreamConvert</c>.</returns>
public static IEnumerable<TextLine> GetEnumerableLines(string basename)
{
    var pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"bin/{basename}.pdf")
                           .Output($"bin/{basename}-test-output.pdf");

    return document.StreamConvert<CreateTextLines>(ProcessPage);
}
/// <summary>
/// Creates a new pipeline, hands it back through <paramref name="pipeline"/>, and returns
/// the result of <c>AllPages&lt;CreateTextLines&gt;(ProcessPage)</c> over
/// <c>bin/{basename}.pdf</c> (output configured as <c>bin/{basename}-test-output.pdf</c>).
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
/// <param name="pipeline">Receives the newly created pipeline; this method does not call <c>Done()</c> on it.</param>
/// <returns>The value returned by <c>AllPages&lt;CreateTextLines&gt;</c>.</returns>
public static PipelineText<TextLine> GetTextLinesWithPipeline(string basename, out Execution.Pipeline pipeline)
{
    pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"bin/{basename}.pdf")
                           .Output($"bin/{basename}-test-output.pdf");

    return document.AllPages<CreateTextLines>(ProcessPage);
}
/// <summary>
/// Runs the 30-stage page chain below on every page of <paramref name="inputname"/>
/// using the supplied pipeline, with <paramref name="outputname"/> configured as output,
/// and returns the result of <c>AllPages&lt;CreateTextLines&gt;</c>.
/// </summary>
/// <param name="pipeline">Caller-owned pipeline; this method does not call <c>Done()</c> on it.</param>
/// <param name="inputname">Path passed to <c>Input</c>.</param>
/// <param name="outputname">Path passed to <c>Output</c>.</param>
/// <returns>The value returned by <c>AllPages&lt;CreateTextLines&gt;</c>.</returns>
public static PipelineText<TextLine> GetTextLines(Execution.Pipeline pipeline, string inputname, string outputname)
{
    var document = pipeline.Input(inputname)
                           .Output(outputname);

    // The numbers count the ParseBlock / ProcessPdfText stages in execution order.
    return document.AllPages<CreateTextLines>(pdfPage =>
        pdfPage.ParsePdf<PreProcessTables>()
               .ParseBlock<IdentifyTables>()                  //  1
               .ParsePdf<PreProcessImages>()
               .ParseBlock<BasicFirstPageStats>()             //  2
               .ParseBlock<RemoveOverlapedImages>()           //  3
               .ParsePdf<ProcessPdfText>()                    //  4
               .ParseBlock<RemoveSmallFonts>()                //  5
               .ParseBlock<MergeTableText>()                  //  6
               .ParseBlock<HighlightTextTable>()              //  7
               .ParseBlock<RemoveTableText>()                 //  8
               .ParseBlock<ReplaceCharacters>()               //  9
               .ParseBlock<GroupLines>()                      // 10
               .ParseBlock<RemoveTableDotChar>()              // 11
               .Show(Color.Yellow)
               .Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
               .ParseBlock<RemoveHeaderImage>()               // 12
               .ParseBlock<FindInitialBlocksetWithRewind>()   // 13
               .Show(Color.Gray)
               .ParseBlock<BreakColumnsLight>()               // 14
               .ParseBlock<AddTableSpace>()                   // 15
               .ParseBlock<RemoveTableOverImage>()            // 16
               .ParseBlock<RemoveImageTexts>()                // 17
               .ParseBlock<AddImageSpace>()                   // 18
               .Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
               .ParseBlock<RemoveFooter>()                    // 19
               .ParseBlock<AddTableHorizontalLines>()         // 20
               .ParseBlock<RemoveBackgroundNonText>()         // 21
               .ParseBlock<BreakColumnsRewrite>()             // 22
               .ParseBlock<BreakInlineElements>()             // 23
               .ParseBlock<ResizeBlocksets>()                 // 24
               .ParseBlock<ResizeBlocksetMagins>()            // 25
               .ParseBlock<OrderBlocksets>()                  // 26
               .ParseBlock<OrganizePageLayout>()              // 27
               .ParseBlock<MergeSequentialLayout>()           // 28
               .ParseBlock<ResizeSequentialLayout>()          // 29
               .Show(Color.Orange)
               .ShowLine(Color.Black)
               .ParseBlock<CheckOverlap>()                    // 30
               .Validate<CheckOverlap>().ShowErrors(err => err.Show(Color.Red))
               .Validate<ValidatePositiveCoordinates>().ShowErrors(err => err.Show(Color.Red))
               .PrintWarnings()
    );
}
/// <summary>
/// Runs <c>ProcessPdfText</c> on page 1 of <c>bin/{basename}.pdf</c> and passes the
/// result to <c>Show(Color.Yellow)</c>. Output is configured as
/// <c>bin/{basename}-tmp-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void MarkAllComponents(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"bin/{basename}.pdf")
                            .Output($"bin/{basename}-tmp-output.pdf")
                            .Page(1);

    var text = firstPage.ParsePdf<ProcessPdfText>();
    text.Show(Color.Yellow);

    pipeline.Done();
}
/// <summary>
/// Runs <c>ProcessPdfText</c> on page 1 of <c>{_inputFolder}/{basename}.pdf</c> and
/// passes the result to <c>Show(Color.Orange)</c>. Output is configured as
/// <c>{pdfsDir}/01-blocks-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void Blocks(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"{_inputFolder}/{basename}.pdf")
                            .Output($"{pdfsDir}/01-blocks-{basename}-output.pdf")
                            .Page(1);

    var text = firstPage.ParsePdf<ProcessPdfText>();
    text.Show(Color.Orange);

    pipeline.Done();
}
/// <summary>
/// Runs <c>PreProcessImages</c> on page 1 of <c>bin/{basename}.pdf</c> and passes the
/// result to <c>Show(Color.Red)</c>. Output is configured as
/// <c>bin/{basename}-img-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void ProcessImages(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"bin/{basename}.pdf")
                            .Output($"bin/{basename}-img-output.pdf")
                            .Page(1);

    var images = firstPage.ParsePdf<PreProcessImages>();
    images.Show(Color.Red);

    pipeline.Done();
}
/// <summary>
/// Runs <c>AllPages&lt;CreateTextLines&gt;(ProcessPage)</c> over <c>bin/{basename}.pdf</c>
/// and converts the result with <c>ConvertText&lt;CreateStructures, TextStructure&gt;</c>.
/// Output is configured as <c>bin/{basename}-test-output.pdf</c>.
/// </summary>
/// <remarks>The pipeline created here is neither returned nor closed by this method.</remarks>
/// <param name="basename">File name without folder or extension.</param>
/// <returns>The value returned by <c>ConvertText</c>.</returns>
public static PipelineText<TextStructure> GetTextParagraphs(string basename)
{
    var pipeline = new Execution.Pipeline();

    var textLines = pipeline.Input($"bin/{basename}.pdf")
                            .Output($"bin/{basename}-test-output.pdf")
                            .AllPages<CreateTextLines>(ProcessPage);

    return textLines.ConvertText<CreateStructures, TextStructure>();
}
/// <summary>
/// Runs <c>PreProcessRenderPath</c> on page 1 of <c>bin/{basename}.pdf</c> and passes
/// the result to <c>ShowLine(Color.Green)</c>. Output is configured as
/// <c>bin/{basename}-tmp-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void ShowRenderPath(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"bin/{basename}.pdf")
                            .Output($"bin/{basename}-tmp-output.pdf")
                            .Page(1);

    var renderPath = firstPage.ParsePdf<PreProcessRenderPath>();
    renderPath.ShowLine(Color.Green);

    pipeline.Done();
}
/// <summary>
/// Runs <c>ProcessPdfText</c> and <c>GroupLines</c> on page 1 of <c>bin/{basename}.pdf</c>
/// and passes the result to <c>Show(Color.Orange)</c>. Output is configured as
/// <c>bin/{basename}-tmp-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void GroupLines(string basename)
{
    var pipeline = new Execution.Pipeline();

    // NOTE(review): Page(1) is selected before Output(...) here, the reverse of the usual
    // Input -> Output -> Page order; the call order is preserved — confirm it is intentional.
    var firstPage = pipeline.Input($"bin/{basename}.pdf").Page(1)
                            .Output($"bin/{basename}-tmp-output.pdf");

    var grouped = firstPage.ParsePdf<ProcessPdfText>()
                           .ParseBlock<GroupLines>();
    grouped.Show(Color.Orange);

    pipeline.Done();
}
/// <summary>
/// For every page of <c>bin/{basename}.pdf</c>, runs <c>ProcessPdfText</c> and passes
/// the result to <c>ShowLine(Color.Orange)</c>. Output is configured as
/// <c>bin/{basename}-follow-text-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void FollowText(string basename)
{
    var pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"bin/{basename}.pdf")
                           .Output($"bin/{basename}-follow-text-output.pdf");

    document.AllPages(pdfPage => pdfPage.ParsePdf<ProcessPdfText>()
                                        .ShowLine(Color.Orange));

    pipeline.Done();
}
/// <summary>
/// Runs <c>ProcessPdfText</c> and <c>GroupLines</c> on page 1 of
/// <c>{_inputFolder}/{basename}.pdf</c> and passes the result to <c>Show(Color.Red)</c>.
/// Output is configured as <c>{pdfsDir}/02-blockline-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void BlockLines(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"{_inputFolder}/{basename}.pdf")
                            .Output($"{pdfsDir}/02-blockline-{basename}-output.pdf")
                            .Page(1);

    var grouped = firstPage.ParsePdf<ProcessPdfText>()
                           .ParseBlock<GroupLines>();
    grouped.Show(Color.Red);

    pipeline.Done();
}
/// <summary>
/// For every page of <c>{_inputFolder}/{basename}.pdf</c>, runs <c>ProcessPdfText</c>,
/// then <c>Validate&lt;RemoveSmallFonts&gt;</c>, passing its errors to
/// <c>ShowText(Color.Green)</c>. Output is configured as
/// <c>{pdfsDir}/10-ids-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void FindIds(string basename)
{
    var pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"{_inputFolder}/{basename}.pdf")
                           .Output($"{pdfsDir}/10-ids-{basename}-output.pdf");

    document.AllPages(pdfPage =>
    {
        var text = pdfPage.ParsePdf<ProcessPdfText>();

        // Validation only: the RemoveSmallFonts stage itself is not applied here.
        text.Validate<RemoveSmallFonts>().ShowErrors(err => err.ShowText(Color.Green));
    });

    pipeline.Done();
}
/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, runs <c>PreProcessTables</c> followed by
/// <c>Show(Color.Yellow)</c>, then <c>IdentifyTables</c> followed by <c>Show(Color.Green)</c>.
/// Output is configured as <c>bin/{basename}-table-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void ShowTables(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"bin/{basename}.pdf")
                            .Output($"bin/{basename}-table-output.pdf")
                            .Page(1);

    var rawTables = firstPage.ParsePdf<PreProcessTables>()
                             .Show(Color.Yellow);

    rawTables.ParseBlock<IdentifyTables>()
             .Show(Color.Green);

    pipeline.Done();
}
/// <summary>
/// For every page of <c>bin/{basename}.pdf</c>, runs <c>PreProcessTables</c> and
/// <c>IdentifyTables</c>, calls <c>Show(Color.Green)</c>, then runs
/// <c>Validate&lt;CheckOverlap&gt;</c>, passing its errors to <c>Show(Color.Red)</c>.
/// Output is configured as <c>bin/{basename}-tables.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void ShowTables(string basename)
{
    var pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"bin/{basename}.pdf")
                           .Output($"bin/{basename}-tables.pdf");

    document.AllPages(pdfPage =>
        pdfPage.ParsePdf<PreProcessTables>()
               .ParseBlock<IdentifyTables>()
               .Show(Color.Green)
               .Validate<CheckOverlap>().ShowErrors(err => err.Show(Color.Red))
    );

    pipeline.Done();
}
/// <summary>
/// Calls <c>VirtualFS.ConfigureFileSystem</c> with <paramref name="virtualFS"/>, then, for
/// every page of <c>{basename}.pdf</c>, runs <c>ProcessPdfText</c> and passes the result
/// to <c>ShowLine(Color.Orange)</c>. Output is configured as
/// <c>{basename}-follow-text-output.pdf</c>; paths carry no folder prefix.
/// </summary>
/// <param name="virtualFS">File system implementation passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">Path without the <c>.pdf</c> extension.</param>
public static void FollowText(IVirtualFS virtualFS, string basename)
{
    // Configured before the pipeline is created, as in the original statement order.
    VirtualFS.ConfigureFileSystem(virtualFS);

    var pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"{basename}.pdf")
                           .Output($"{basename}-follow-text-output.pdf");

    document.AllPages(pdfPage => pdfPage.ParsePdf<ProcessPdfText>()
                                        .ShowLine(Color.Orange));

    pipeline.Done();
}
/// <summary>
/// On page 1 of <c>{_inputFolder}/{basename}.pdf</c>, runs <c>ProcessPdfText</c> and
/// <c>GroupLines</c>, calls <c>ShowLine(Color.Green)</c>, then runs
/// <c>FindInitialBlockset</c> and calls <c>Show(Color.Orange)</c>. Output is configured
/// as <c>{pdfsDir}/04-followline-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void FollowLine(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"{_inputFolder}/{basename}.pdf")
                            .Output($"{pdfsDir}/04-followline-{basename}-output.pdf")
                            .Page(1);

    var grouped = firstPage.ParsePdf<ProcessPdfText>()
                           .ParseBlock<GroupLines>()
                           .ShowLine(Color.Green);

    grouped.ParseBlock<FindInitialBlockset>()
           .Show(Color.Orange);

    pipeline.Done();
}
/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, runs <c>PreProcessImages</c>, validates
/// <c>RemoveOverlapedImages</c> (errors passed to <c>Show(Color.Red)</c>), applies that
/// stage, and calls <c>Show(Color.Green)</c>. Output is configured as
/// <c>bin/{basename}-tmp-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void RemoveOverlapedImages(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"bin/{basename}.pdf")
                            .Output($"bin/{basename}-tmp-output.pdf")
                            .Page(1);

    // Table stages are disabled in this example:
    //.ParsePdf<PreProcessTables>()
    //    .ParseBlock<IdentifyTables>()
    var images = firstPage.ParsePdf<PreProcessImages>();

    images.Validate<RemoveOverlapedImages>().ShowErrors(err => err.Show(Color.Red))
          .ParseBlock<RemoveOverlapedImages>()
          .Show(Color.Green);

    pipeline.Done();
}
/// <summary>
/// Creates a new pipeline, hands it back through <paramref name="pipeline"/>, and runs the
/// page chain below over <c>{inputfolder}/{basename}.pdf</c> via
/// <c>AllPagesExcept&lt;CreateTextLines&gt;</c> with an empty exclusion array. Output is
/// configured as <c>{outputfolder}/{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
/// <param name="inputfolder">Folder prefix of the input path.</param>
/// <param name="outputfolder">Folder prefix of the output path.</param>
/// <param name="pipeline">Receives the newly created pipeline; this method does not call <c>Done()</c> on it.</param>
/// <returns>The value returned by <c>AllPagesExcept&lt;CreateTextLines&gt;</c>.</returns>
static PipelineText<TextLine> GetTextLines(string basename, string inputfolder, string outputfolder, out Execution.Pipeline pipeline)
{
    pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"{inputfolder}/{basename}.pdf")
                           .Output($"{outputfolder}/{basename}-output.pdf");

    return document.AllPagesExcept<CreateTextLines>(new int[0], pdfPage =>
        pdfPage.ParsePdf<PreProcessTables>()
               .ParseBlock<IdentifyTables>()
               .ParsePdf<PreProcessImages>()
               .ParseBlock<RemoveOverlapedImages>()
               .ParsePdf<ProcessPdfText>()
               .Validate<RemoveSmallFonts>().ShowErrors(err => err.ShowText(Color.Green))
               .ParseBlock<RemoveSmallFonts>()
               .ParseBlock<MergeTableText>()
               .ParseBlock<HighlightTextTable>()
               .ParseBlock<RemoveTableText>()
               .ParseBlock<ReplaceCharacters>()
               .ParseBlock<GroupLines>()
               .ParseBlock<RemoveTableDotChar>()
               .Show(Color.Yellow)
               .Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
               .ParseBlock<RemoveHeaderImage>()
               .ParseBlock<FindInitialBlocksetWithRewind>()
               .Show(Color.Gray)
               .ParseBlock<BreakColumnsLight>()
               // Disabled alternative column stage:
               //.ParseBlock<BreakColumns>()
               .ParseBlock<AddTableSpace>()
               .ParseBlock<RemoveTableOverImage>()
               .ParseBlock<RemoveImageTexts>()
               .ParseBlock<AddImageSpace>()
               .Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
               .ParseBlock<RemoveFooter>()
               .ParseBlock<BreakInlineElements>()
               .ParseBlock<ResizeBlocksets>()
               .Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Gray))
               .ParseBlock<OrderBlocksets>()
               .Show(Color.Orange)
               .ShowLine(Color.Black)
               .ParseBlock<OrganizePageLayout>()
               .ParseBlock<CheckOverlap>()
               .Validate<ValidatePositiveCoordinates>().ShowErrors(err => err.Show(Color.Red))
    );
}
/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, runs the table stages and calls
/// <c>Show(Color.Green)</c>, then runs <c>ProcessPdfText</c>, <c>RemoveTableText</c> and
/// <c>GroupLines</c> and calls <c>Show(Color.Red)</c>. Output is configured as
/// <c>bin/{basename}-table-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
public static void RemoveTables(string basename)
{
    var pipeline = new Execution.Pipeline();

    var firstPage = pipeline.Input($"bin/{basename}.pdf")
                            .Output($"bin/{basename}-table-output.pdf")
                            .Page(1);

    var tables = firstPage.ParsePdf<PreProcessTables>()
                          .ParseBlock<IdentifyTables>()
                          .Show(Color.Green);

    tables.ParsePdf<ProcessPdfText>()
          .ParseBlock<RemoveTableText>()
          .ParseBlock<GroupLines>()
          .Show(Color.Red);

    pipeline.Done();
}
/// <summary>
/// Creates a new pipeline, hands it back through <paramref name="pipeline"/>, and runs the
/// page chain below over <c>bin/{basename}.pdf</c> via
/// <c>AllPagesExcept&lt;CreateTextLines&gt;</c> with an empty exclusion array. Output is
/// configured as <c>bin/{basename}-test-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without folder or extension.</param>
/// <param name="pipeline">Receives the newly created pipeline; this method does not call <c>Done()</c> on it.</param>
/// <returns>The value returned by <c>AllPagesExcept&lt;CreateTextLines&gt;</c>.</returns>
static PipelineText<TextLine> GetTextLinesWithPipelineBlockset(string basename, out Execution.Pipeline pipeline)
{
    pipeline = new Execution.Pipeline();

    var document = pipeline.Input($"bin/{basename}.pdf")
                           .Output($"bin/{basename}-test-output.pdf");

    return document.AllPagesExcept<CreateTextLines>(new int[0], pdfPage =>
        pdfPage.ParsePdf<PreProcessTables>()
               .ParseBlock<IdentifyTables>()
               .ParsePdf<PreProcessImages>()
               .Validate<RemoveOverlapedImages>().ShowErrors(err => err.Show(Color.Red))
               .ParseBlock<RemoveOverlapedImages>()
               .ParsePdf<ProcessPdfText>()
               .Validate<RemoveSmallFonts>().ShowErrors(err => err.ShowText(Color.Green))
               .ParseBlock<RemoveSmallFonts>()
               // Disabled validation:
               //.Validate<MergeTableText>().ShowErrors(p => p.Show(Color.Blue))
               .ParseBlock<MergeTableText>()
               // Disabled validation:
               //.Validate<HighlightTextTable>().ShowErrors(p => p.Show(Color.Green))
               .ParseBlock<HighlightTextTable>()
               .ParseBlock<RemoveTableText>()
               .ParseBlock<GroupLines>()
               .Show(Color.Yellow)
               .Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
               .ParseBlock<RemoveHeaderImage>()
               .ParseBlock<FindInitialBlocksetWithRewind>()
               .Show(Color.Gray)
               .ParseBlock<BreakColumnsLight>()
               // Disabled alternative column stage:
               //.ParseBlock<BreakColumns>()
               .Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
               .ParseBlock<RemoveFooter>()
               .ParseBlock<AddTableSpace>()
               .ParseBlock<AddImageSpace>()
               .ParseBlock<BreakInlineElements>()
               .ParseBlock<ResizeBlocksets>()
               .Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Red))
               .ParseBlock<OrderBlocksets>()
               .Show(Color.Orange)
               .ShowLine(Color.Black)
    );
}