/// <summary>
/// Runs the table, image, text and blockset stages on page 1 of
/// <c>bin/{basename}.pdf</c> and finishes by validating <c>ResizeBlocksets</c>,
/// showing validation errors in red. The pipeline output is set to
/// <c>bin/{basename}-test-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void ValidateResizeBlock(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-test-output.pdf")
                   .Page(1);

    // Tables and images.
    var images = page.ParsePdf<PreProcessTables>()
                     .ParseBlock<IdentifyTables>()
                     .ParsePdf<PreProcessImages>()
                     .Validate<RemoveOverlapedImages>().ShowErrors(err => err.Show(Color.Red))
                     .ParseBlock<RemoveOverlapedImages>();

    // Text, lines, blocksets and columns.
    var columns = images.ParsePdf<ProcessPdfText>()
                        .ParseBlock<RemoveTableText>()
                        .ParseBlock<GroupLines>()
                        .ParseBlock<FindInitialBlockset>()
                        .ParseBlock<BreakColumns>();

    // Disabled in the original: Validate<RemoveFooter> / Validate<RemoveHeaderImage>
    // with purple error display, and ParseBlock<BreakInlineElements>.
    var spaced = columns.ParseBlock<RemoveFooter>()
                        .ParseBlock<RemoveHeaderImage>()
                        .ParseBlock<AddTableSpace>()
                        .ParseBlock<AddImageSpace>();

    spaced.Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Red));

    pipe.Done();
}
/// <summary>
/// Runs the table, text, header/footer, resize and ordering stages on page 1 of
/// <c>bin/{basename}.pdf</c>, then shows the result in orange with black lines.
/// The pipeline output is set to <c>bin/{basename}-table-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void ResizeBlocksets(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-table-output.pdf")
                   .Page(1);

    var columns = page.ParsePdf<PreProcessTables>()
                      .ParseBlock<IdentifyTables>()
                      .ParsePdf<ProcessPdfText>()
                      .ParseBlock<RemoveTableText>()
                      .ParseBlock<GroupLines>()
                      .ParseBlock<FindInitialBlockset>()
                      .ParseBlock<BreakColumns>();

    // Both validations run before either removal; errors are shown in purple.
    var body = columns.Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                      .Validate<RemoveHeader>().ShowErrors(err => err.Show(Color.Purple))
                      .ParseBlock<RemoveFooter>()
                      .ParseBlock<RemoveHeader>();

    var ordered = body.ParseBlock<AddTableSpace>()
                      .ParseBlock<ResizeBlocksets>()
                      .ParseBlock<OrderBlocksets>();

    ordered.Show(Color.Orange)
           .ShowLine(Color.Black);

    pipe.Done();
}
/// <summary>
/// Runs the full page-1 stage sequence ending in <c>OrderBlocksets</c> on
/// <c>{_inputFolder}/{basename}.pdf</c> and shows the result in orange with
/// black lines. The pipeline output is set to
/// <c>{pdfsDir}/09-orders-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void CorrectOrder(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"{_inputFolder}/{basename}.pdf")
                   .Output($"{pdfsDir}/09-orders-{basename}-output.pdf")
                   .Page(1);

    var images = page.ParsePdf<PreProcessTables>()
                     .ParseBlock<IdentifyTables>()
                     .ParsePdf<PreProcessImages>()
                     .ParseBlock<RemoveOverlapedImages>();

    var lines = images.ParsePdf<ProcessPdfText>()
                      .ParseBlock<RemoveSmallFonts>()
                      .ParseBlock<MergeTableText>()
                      .ParseBlock<HighlightTextTable>()
                      .ParseBlock<RemoveTableText>()
                      .ParseBlock<GroupLines>();

    var columns = lines.Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                       .ParseBlock<RemoveHeaderImage>()
                       .ParseBlock<FindInitialBlocksetWithRewind>()
                       .ParseBlock<BreakColumnsLight>();

    var resized = columns.Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
                         .ParseBlock<RemoveFooter>()
                         .ParseBlock<AddTableSpace>()
                         .ParseBlock<AddImageSpace>()
                         .ParseBlock<BreakInlineElements>()
                         .ParseBlock<ResizeBlocksets>()
                         .Validate<ResizeBlocksets>().ShowErrors(err => err.Show(Color.Red));

    resized.ParseBlock<OrderBlocksets>()
           .Show(Color.Orange)
           .ShowLine(Color.Black);

    pipe.Done();
}
/// <summary>
/// Runs the validation stages (<c>ProcessPdfValidation</c>,
/// <c>IdentifyValidationMarks</c>, <c>CheckNoBlockSetOverlap</c>) over all pages
/// of <c>{inputfolder}/{basename}.pdf</c>, then saves
/// <c>{outputfolder}/{basename}-ok.pdf</c> and
/// <c>{outputfolder}/errors/{basename}-errors.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
/// <param name="inputfolder">Folder that contains the input PDF.</param>
/// <param name="outputfolder">Folder under which the result PDFs are saved.</param>
/// <returns>The value returned by <c>pipeline.SaveErrors</c>.</returns>
public static int Process(string basename, string inputfolder, string outputfolder)
{
    // Optional switches left disabled in the original:
    //PdfReaderException.DisableWarnings();
    //PdfReaderException.ContinueOnException();

    var pipeline = new Execution.Pipeline();

    // The list itself is not needed, so it is no longer stored in a local.
    // ToList() is kept; presumably it forces the per-page work to run (confirm).
    pipeline.Input($"{inputfolder}/{basename}.pdf")
            //.Output($"{outputfolder}/{basename}-output.pdf")
            .AllPagesExcept<CreateTextLines>(new int[] { }, page =>
                page.ParsePdf<ProcessPdfValidation>()
                    //.Show(Color.White)
                    .ParseBlock<IdentifyValidationMarks>()
                    .PdfCheck<CheckNoBlockSetOverlap>(Color.Orange)
                    //.Show(Color.Blue)
            )
            .ToList();

    pipeline.SaveOk($"{outputfolder}/{basename}-ok.pdf");
    int errors = pipeline.SaveErrors($"{outputfolder}/errors/{basename}-errors.pdf");

    pipeline.Done();

    return errors;
}
/// <summary>
/// For every page of <c>bin/{basename}.pdf</c>, shows identified tables in red
/// and pre-processed images in green, then runs the text stages with
/// <c>RemoveHeaderImage</c> and <c>RemoveFooter</c> validation (errors in
/// purple) followed by the removals themselves. The pipeline output is set to
/// <c>bin/{basename}-header-footer-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void ShowHeaderFooter(string basename)
{
    var pipe = new Execution.Pipeline();

    var document = pipe.Input($"bin/{basename}.pdf")
                       .Output($"bin/{basename}-header-footer-output.pdf");

    document.AllPages(page =>
    {
        var tables = page.ParsePdf<PreProcessTables>()
                         .ParseBlock<IdentifyTables>()
                         .Show(Color.Red);

        var images = tables.ParsePdf<PreProcessImages>()
                           .Show(Color.Green)
                           .ParseBlock<RemoveOverlapedImages>();

        var lines = images.ParsePdf<ProcessPdfText>()
                          .ParseBlock<RemoveSmallFonts>()
                          .ParseBlock<MergeTableText>()
                          .ParseBlock<HighlightTextTable>()
                          .ParseBlock<RemoveTableText>()
                          .ParseBlock<GroupLines>();

        // ParseBlock<BreakColumns> was disabled in the original; BreakColumnsLight is used.
        var columns = lines.Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                           .ParseBlock<RemoveHeaderImage>()
                           .ParseBlock<FindInitialBlocksetWithRewind>()
                           .ParseBlock<BreakColumnsLight>();

        columns.Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
               .ParseBlock<RemoveFooter>();
    });

    pipe.Done();
}
/// <summary>
/// Applies <c>ProcessPage</c> to every page of <c>bin/{basename}.pdf</c>.
/// The pipeline output is set to <c>bin/{basename}-page-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void MultipageCore(string basename)
{
    var pipe = new Execution.Pipeline();

    var document = pipe.Input($"bin/{basename}.pdf")
                       .Output($"bin/{basename}-page-output.pdf");

    document.AllPages(pg => ProcessPage(pg));

    pipe.Done();
}
/// <summary>
/// Runs <c>ProcessPdfText</c> on page 1 of <c>bin/{basename}.pdf</c> and shows
/// the result in yellow. The pipeline output is set to
/// <c>bin/{basename}-tmp-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void MarkAllComponents(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-tmp-output.pdf")
                   .Page(1);

    page.ParsePdf<ProcessPdfText>()
        .Show(Color.Yellow);

    pipe.Done();
}
/// <summary>
/// Runs <c>ProcessPdfText</c> and <c>GroupLines</c> on page 1 of
/// <c>bin/{basename}.pdf</c> and shows the result in orange. The output path
/// is <c>bin/{basename}-tmp-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void GroupLines(string basename)
{
    var pipe = new Execution.Pipeline();

    // Call order kept from the original: Page(1) is selected before Output(...).
    var page = pipe.Input($"bin/{basename}.pdf")
                   .Page(1)
                   .Output($"bin/{basename}-tmp-output.pdf");

    page.ParsePdf<ProcessPdfText>()
        .ParseBlock<GroupLines>()
        .Show(Color.Orange);

    pipe.Done();
}
/// <summary>
/// Runs <c>ProcessPdfText</c> on page 1 of <c>{_inputFolder}/{basename}.pdf</c>
/// and shows the result in orange. The pipeline output is set to
/// <c>{pdfsDir}/01-blocks-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void Blocks(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"{_inputFolder}/{basename}.pdf")
                   .Output($"{pdfsDir}/01-blocks-{basename}-output.pdf")
                   .Page(1);

    page.ParsePdf<ProcessPdfText>()
        .Show(Color.Orange);

    pipe.Done();
}
/// <summary>
/// Runs <c>PreProcessRenderPath</c> on page 1 of <c>bin/{basename}.pdf</c> and
/// shows the resulting lines in green. The pipeline output is set to
/// <c>bin/{basename}-tmp-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void ShowRenderPath(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-tmp-output.pdf")
                   .Page(1);

    page.ParsePdf<PreProcessRenderPath>()
        .ShowLine(Color.Green);

    pipe.Done();
}
/// <summary>
/// Runs <c>PreProcessImages</c> on page 1 of <c>bin/{basename}.pdf</c> and
/// shows the result in red. The pipeline output is set to
/// <c>bin/{basename}-img-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void ProcessImages(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-img-output.pdf")
                   .Page(1);

    page.ParsePdf<PreProcessImages>()
        .Show(Color.Red);

    pipe.Done();
}
/// <summary>
/// For every page of <c>bin/{basename}.pdf</c>, runs <c>ProcessPdfText</c> and
/// shows the resulting lines in orange. The pipeline output is set to
/// <c>bin/{basename}-follow-text-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void FollowText(string basename)
{
    var pipe = new Execution.Pipeline();

    var document = pipe.Input($"bin/{basename}.pdf")
                       .Output($"bin/{basename}-follow-text-output.pdf");

    document.AllPages(pg => pg.ParsePdf<ProcessPdfText>().ShowLine(Color.Orange));

    pipe.Done();
}
/// <summary>
/// Runs <c>ProcessPdfText</c> and <c>GroupLines</c> on page 1 of
/// <c>{_inputFolder}/{basename}.pdf</c> and shows the result in red. The
/// pipeline output is set to <c>{pdfsDir}/02-blockline-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void BlockLines(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"{_inputFolder}/{basename}.pdf")
                   .Output($"{pdfsDir}/02-blockline-{basename}-output.pdf")
                   .Page(1);

    var text = page.ParsePdf<ProcessPdfText>();

    text.ParseBlock<GroupLines>()
        .Show(Color.Red);

    pipe.Done();
}
/// <summary>
/// For every page of <c>{_inputFolder}/{basename}.pdf</c>, runs
/// <c>ProcessPdfText</c>, validates <c>RemoveSmallFonts</c> and shows the
/// validation errors as green text. The pipeline output is set to
/// <c>{pdfsDir}/10-ids-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void FindIds(string basename)
{
    var pipe = new Execution.Pipeline();

    var document = pipe.Input($"{_inputFolder}/{basename}.pdf")
                       .Output($"{pdfsDir}/10-ids-{basename}-output.pdf");

    document.AllPages(page =>
    {
        var text = page.ParsePdf<ProcessPdfText>();

        text.Validate<RemoveSmallFonts>().ShowErrors(err => err.ShowText(Color.Green));
    });

    pipe.Done();
}
/// <summary>
/// For every page of <c>bin/{basename}.pdf</c>, identifies tables, shows them
/// in green, then validates <c>CheckOverlap</c> and shows those errors in red.
/// The pipeline output is set to <c>bin/{basename}-tables.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void ShowTables(string basename)
{
    var pipe = new Execution.Pipeline();

    var document = pipe.Input($"bin/{basename}.pdf")
                       .Output($"bin/{basename}-tables.pdf");

    document.AllPages(pg =>
        pg.ParsePdf<PreProcessTables>()
          .ParseBlock<IdentifyTables>()
          .Show(Color.Green)
          .Validate<CheckOverlap>().ShowErrors(err => err.Show(Color.Red)));

    pipe.Done();
}
/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, shows the <c>PreProcessTables</c>
/// result in yellow and the <c>IdentifyTables</c> result in green. The
/// pipeline output is set to <c>bin/{basename}-table-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void ShowTables(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-table-output.pdf")
                   .Page(1);

    var rawTables = page.ParsePdf<PreProcessTables>()
                        .Show(Color.Yellow);

    rawTables.ParseBlock<IdentifyTables>()
             .Show(Color.Green);

    pipe.Done();
}
/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, runs <c>PreProcessImages</c>,
/// validates <c>RemoveOverlapedImages</c> (errors in red), applies it, and
/// shows the result in green. The pipeline output is set to
/// <c>bin/{basename}-tmp-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void RemoveOverlapedImages(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-tmp-output.pdf")
                   .Page(1);

    // The table stages (PreProcessTables / IdentifyTables) were disabled in the original.
    var images = page.ParsePdf<PreProcessImages>();

    images.Validate<RemoveOverlapedImages>().ShowErrors(err => err.Show(Color.Red))
          .ParseBlock<RemoveOverlapedImages>()
          .Show(Color.Green);

    pipe.Done();
}
/// <summary>
/// On page 1 of <c>{_inputFolder}/{basename}.pdf</c>, groups text lines and
/// shows them as green lines, then runs <c>FindInitialBlockset</c> and shows
/// the result in orange. The pipeline output is set to
/// <c>{pdfsDir}/04-followline-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void FollowLine(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"{_inputFolder}/{basename}.pdf")
                   .Output($"{pdfsDir}/04-followline-{basename}-output.pdf")
                   .Page(1);

    var lines = page.ParsePdf<ProcessPdfText>()
                    .ParseBlock<GroupLines>()
                    .ShowLine(Color.Green);

    lines.ParseBlock<FindInitialBlockset>()
         .Show(Color.Orange);

    pipe.Done();
}
/// <summary>
/// Configures <c>VirtualFS</c> with <paramref name="virtualFS"/>, then for
/// every page of <c>{basename}.pdf</c> runs <c>ProcessPdfText</c> and shows the
/// resulting lines in orange. The pipeline output is set to
/// <c>{basename}-follow-text-output.pdf</c>.
/// </summary>
/// <param name="virtualFS">File system passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void FollowText(IVirtualFS virtualFS, string basename)
{
    // Must happen before the pipeline is created, as in the original.
    VirtualFS.ConfigureFileSystem(virtualFS);

    var pipe = new Execution.Pipeline();

    var document = pipe.Input($"{basename}.pdf")
                       .Output($"{basename}-follow-text-output.pdf");

    document.AllPages(pg => pg.ParsePdf<ProcessPdfText>().ShowLine(Color.Orange));

    pipe.Done();
}
/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, shows identified tables in green,
/// then runs <c>ProcessPdfText</c>, <c>RemoveTableText</c> and
/// <c>GroupLines</c> and shows the result in red. The pipeline output is set
/// to <c>bin/{basename}-table-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void RemoveTables(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-table-output.pdf")
                   .Page(1);

    var tables = page.ParsePdf<PreProcessTables>()
                     .ParseBlock<IdentifyTables>()
                     .Show(Color.Green);

    tables.ParsePdf<ProcessPdfText>()
          .ParseBlock<RemoveTableText>()
          .ParseBlock<GroupLines>()
          .Show(Color.Red);

    pipe.Done();
}
/// <summary>
/// On page 1 of <c>{inputfolder}/{basename}</c>, runs
/// <c>ProcessPdfValidation</c> (shown in white), <c>IdentifyValidationMarks</c>
/// and <c>CheckNoBlockSetOverlap</c>, then shows the result in blue. The
/// pipeline output is set to <c>{outputfolder}/{basename}-invalid.pdf</c>.
/// </summary>
/// <param name="basename">Name of the input file, used as-is for the input path.</param>
/// <param name="inputfolder">Folder that contains the input file.</param>
/// <param name="outputfolder">Folder for the output PDF.</param>
public static void ProcessPage1(string basename, string inputfolder, string outputfolder)
{
    // Optional switches left disabled in the original:
    //PdfReaderException.DisableWarnings();
    //PdfReaderException.ContinueOnException();

    var pipeline = new Execution.Pipeline();

    // NOTE(review): unlike sibling methods, the input path has no ".pdf" suffix,
    // so the output becomes "<name>.pdf-invalid.pdf" if basename already carries
    // the extension. Confirm against callers before changing.
    // The chain result was stored in an unused local; it is now simply discarded.
    pipeline.Input($"{inputfolder}/{basename}")
            .Output($"{outputfolder}/{basename}-invalid.pdf")
            .Page(1)
            .ParsePdf<ProcessPdfValidation>()
            .Show(Color.White)
            .ParseBlock<IdentifyValidationMarks>()
            .ParseBlock<CheckNoBlockSetOverlap>()
            .Show(Color.Blue);

    pipeline.Done();
}
/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, runs the table, image and text
/// stages, then <c>AddTableSpace</c> and <c>AddImageSpace</c>, and shows the
/// result in orange. The pipeline output is set to
/// <c>bin/{basename}-img-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void AddImageSpace(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-img-output.pdf")
                   .Page(1);

    var images = page.ParsePdf<PreProcessTables>()
                     .ParseBlock<IdentifyTables>()
                     .ParsePdf<PreProcessImages>();

    var blockset = images.ParsePdf<ProcessPdfText>()
                         .ParseBlock<RemoveTableText>()
                         .ParseBlock<GroupLines>()
                         .ParseBlock<FindInitialBlockset>();

    blockset.ParseBlock<AddTableSpace>()
            .ParseBlock<AddImageSpace>()
            .Show(Color.Orange);

    pipe.Done();
}
/// <summary>
/// On page 1 of <c>{_inputFolder}/{basename}.pdf</c>, runs the table, image
/// and text stages, validates and applies <c>RemoveHeaderImage</c> (errors in
/// purple), then runs <c>FindInitialBlocksetWithRewind</c> and
/// <c>BreakColumnsLight</c> and shows the result in orange. The pipeline
/// output is set to <c>{pdfsDir}/05-breakcolumn-{basename}-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void BreakColumn(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"{_inputFolder}/{basename}.pdf")
                   .Output($"{pdfsDir}/05-breakcolumn-{basename}-output.pdf")
                   .Page(1);

    var images = page.ParsePdf<PreProcessTables>()
                     .ParseBlock<IdentifyTables>()
                     .ParsePdf<PreProcessImages>()
                     .ParseBlock<RemoveOverlapedImages>();

    var lines = images.ParsePdf<ProcessPdfText>()
                      .ParseBlock<GroupLines>();

    var headless = lines.Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Purple))
                        .ParseBlock<RemoveHeaderImage>();

    headless.ParseBlock<FindInitialBlocksetWithRewind>()
            .ParseBlock<BreakColumnsLight>()
            .Show(Color.Orange);

    pipe.Done();
}
/// <summary>
/// Resets <c>BasicFirstPageStats</c>, enables
/// <c>PdfReaderException.ContinueOnException</c>, then feeds the result of
/// <c>GetTextLines</c> through the text-conversion chain. Along the way it
/// writes <c>{outputfolder}/{basename}-show-central.pdf</c> and
/// <c>{outputfolder}/{basename}-tree.txt</c>, and finally extracts
/// <c>{outputfolder}/errors/{basename}-parser-errors.pdf</c>.
/// </summary>
/// <param name="basename">Base file name forwarded to <c>GetTextLines</c> and used in the output names.</param>
/// <param name="inputfolder">Input folder forwarded to <c>GetTextLines</c>.</param>
/// <param name="outputfolder">Folder under which the output files are written.</param>
public static void Process(string basename, string inputfolder, string outputfolder)
{
    BasicFirstPageStats.Reset();

    PdfReaderException.ContinueOnException();

    var pipeline = new Execution.Pipeline();

    // The list itself is not needed, so it is no longer stored in a local.
    // ToList() is kept; presumably it forces the conversion chain to run (confirm).
    GetTextLines(pipeline, basename, inputfolder, outputfolder)
        .ConvertText<CreateTextLineIndex, TextLine>()
        .ConvertText<PreCreateStructures, TextLine2>()
        .ConvertText<CreateStructures2, TextStructure>()
        .ConvertText<PreCreateTextSegments, TextStructureAgg>()
        .ConvertText<AggregateStructures, TextStructure>()
        .ShowPdf<ShowStructureCentral>($"{outputfolder}/{basename}-show-central.pdf")
        .ConvertText<CreateTextSegments, TextSegment>()
        .ConvertText<CreateTreeSegments, TextSegment>()
        .Log<AnalyzeTreeStructure>($"{outputfolder}/{basename}-tree.txt")
        .ToList();

    pipeline.ExtractOutput<ShowParserWarnings>($"{outputfolder}/errors/{basename}-parser-errors.pdf");

    pipeline.Done();
}
//public static void ValidateBreakColumns(string basename)
//{
//    var pipeline = new Execution.Pipeline();
//    pipeline.Input($"bin/{basename}.pdf").Page(1)
//            .Output($"bin/{basename}-tmp-output.pdf")
//            .ParsePdf<ProcessPdfText>()
//                .ParseBlock<GroupLines>()
//                .ParseBlock<FindInitialBlockset>()
//                .Validate<BreakColumns>()
//                .ShowErrors(p => p.Show(Color.Purple));
//    pipeline.Done();
//}

//public static void BreakColumns(string basename)
//{
//    var pipeline = new Execution.Pipeline();
//    pipeline.Input($"bin/{basename}.pdf").Page(1)
//            .Output($"bin/{basename}-tmp-output.pdf")
//            .ParsePdf<ProcessPdfText>()
//                .ParseBlock<GroupLines>()
//                .ParseBlock<FindInitialBlockset>()
//                .Validate<BreakColumns>().ShowErrors(p => p.Show(Color.LightGray))
//                .ParseBlock<BreakColumns>()
//                .Show(Color.Green)
//                .Validate<BreakColumns>().ShowErrors(p => p.Show(Color.Red));
//    pipeline.Done();
//}

//public static void RemoveHeaderFooter(string basename)
//{
//    var pipeline = new Execution.Pipeline();
//    pipeline.Input($"bin/{basename}.pdf").Page(1)
//            .Output($"bin/{basename}-tmp-output.pdf")
//            .ParsePdf<ProcessPdfText>()
//                .ParseBlock<GroupLines>()
//                .ParseBlock<FindInitialBlockset>()
//                .ParseBlock<BreakColumns>()
//                .Validate<RemoveFooter>().ShowErrors(p => p.Show(Color.Purple))
//                .Validate<RemoveHeader>().ShowErrors(p => p.Show(Color.Purple))
//                .ParseBlock<RemoveFooter>()
//                .ParseBlock<RemoveHeader>()
//                .Show(Color.Yellow);
//    pipeline.Done();
//}

/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, builds the initial blockset and
/// shows its lines in gray, runs <c>MergeBlockLines</c> and shows the result
/// in green, then validates (errors in purple) and applies
/// <c>RemoveFooter</c> and <c>RemoveHeader</c>. The output path is
/// <c>bin/{basename}-tmp-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void MergeBlockLines(string basename)
{
    var pipe = new Execution.Pipeline();

    // Call order kept from the original: Page(1) is selected before Output(...).
    var page = pipe.Input($"bin/{basename}.pdf")
                   .Page(1)
                   .Output($"bin/{basename}-tmp-output.pdf");

    // Disabled in the original at this point: ParseBlock<TestSplitBlocksets> and Show(Color.Red).
    var blockset = page.ParsePdf<ProcessPdfText>()
                       .ParseBlock<GroupLines>()
                       .ParseBlock<FindInitialBlockset>()
                       .ShowLine(Color.Gray);

    // Disabled in the original after the merge: ParseBlock<BreakColumns>.
    var merged = blockset.ParseBlock<MergeBlockLines>()
                         .Show(Color.Green);

    // Both validations run before either removal. A trailing Show(Color.Yellow)
    // was disabled in the original.
    merged.Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Purple))
          .Validate<RemoveHeader>().ShowErrors(err => err.Show(Color.Purple))
          .ParseBlock<RemoveFooter>()
          .ParseBlock<RemoveHeader>();

    pipe.Done();
}
/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, runs the table, image and text
/// stages up to <c>FindInitialBlockset</c>, validates <c>RemoveFooter</c> and
/// <c>RemoveHeaderImage</c> (errors in orange), applies both, and shows the
/// result in yellow. The pipeline output is set to
/// <c>bin/{basename}-img-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void RemoveHeaderImage(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-img-output.pdf")
                   .Page(1);

    var images = page.ParsePdf<PreProcessTables>()
                     .ParseBlock<IdentifyTables>()
                     .ParsePdf<PreProcessImages>();

    var blockset = images.ParsePdf<ProcessPdfText>()
                         .ParseBlock<RemoveTableText>()
                         .ParseBlock<GroupLines>()
                         .ParseBlock<FindInitialBlockset>();

    // The RemoveHeader validation was disabled in the original; RemoveHeaderImage is validated instead.
    var validated = blockset.Validate<RemoveFooter>().ShowErrors(err => err.Show(Color.Orange))
                            .Validate<RemoveHeaderImage>().ShowErrors(err => err.Show(Color.Orange));

    // Likewise, ParseBlock<RemoveHeader> was disabled in the original.
    validated.ParseBlock<RemoveFooter>()
             .ParseBlock<RemoveHeaderImage>()
             .Show(Color.Yellow);

    pipe.Done();
}
/// <summary>
/// On page 1 of <c>bin/{basename}.pdf</c>, builds the initial blockset (shown
/// in yellow), runs <c>DetectImplicitTable</c> (shown in green), validates it
/// with errors in red, and finally shows lines in black. The pipeline output
/// is set to <c>bin/{basename}-table-output.pdf</c>.
/// </summary>
/// <param name="basename">File name without the <c>.pdf</c> extension.</param>
public static void DetectInvisibleTable(string basename)
{
    var pipe = new Execution.Pipeline();

    var page = pipe.Input($"bin/{basename}.pdf")
                   .Output($"bin/{basename}-table-output.pdf")
                   .Page(1);

    var images = page.ParsePdf<PreProcessTables>()
                     .ParseBlock<IdentifyTables>()
                     .ParsePdf<PreProcessImages>()
                     .Validate<RemoveOverlapedImages>().ShowErrors(err => err.Show(Color.Red))
                     .ParseBlock<RemoveOverlapedImages>();

    var blockset = images.ParsePdf<ProcessPdfText>()
                         .ParseBlock<RemoveTableText>()
                         .ParseBlock<GroupLines>()
                         .ParseBlock<FindInitialBlockset>()
                         .Show(Color.Yellow);

    var detected = blockset.ParseBlock<DetectImplicitTable>()
                           .Show(Color.Green);

    detected.Validate<DetectImplicitTable>().ShowErrors(err => err.Show(Color.Red))
            .ShowLine(Color.Black);

    pipe.Done();
}