/// <summary>
/// Runs the text-parsing pipeline for one document and writes its logs and
/// diagnostic PDFs under <c>{outputfolder}/{basename}</c>.
/// </summary>
/// <param name="virtualFS">File system implementation passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">Document name; forwarded to <c>GetTextLines</c> and used as the output sub-folder name.</param>
/// <param name="inputfolder">Input folder forwarded to <c>GetTextLines</c>.</param>
/// <param name="outputfolder">Root folder under which the per-document files are written.</param>
public static void RunParserPDF(IVirtualFS virtualFS, string basename, string inputfolder, string outputfolder)
{
    VirtualFS.ConfigureFileSystem(virtualFS);
    PdfReaderException.ContinueOnException();

    // Every artifact produced by this method shares the same per-document folder.
    string reportDir = $"{outputfolder}/{basename}";

    Pipeline parserPipeline = new Pipeline();

    // Phase 1: text lines -> aggregated text structures, with their logs and preview PDF.
    var structures = GetTextLines(parserPipeline, basename, inputfolder, outputfolder)
        .Log<AnalyzeLines>($"{reportDir}/lines.txt")
        .ConvertText<CreateTextLineIndex, TextLine>()
        .ConvertText<PreCreateStructures, TextLine2>()
        .ConvertText<CreateStructures2, TextStructure>()
        .ConvertText<PreCreateTextSegments, TextStructureAgg>()
        .ConvertText<AggregateStructures, TextStructure>()
        .ShowPdf<ShowStructureCentral>($"{reportDir}/show-central.pdf")
        .Log<AnalyzeStructures>($"{reportDir}/struct.txt")
        .Log<AnalyzeStructuresCentral>($"{reportDir}/central.txt");

    // Phase 2: structures -> text segments -> segment tree, logged to file and console.
    var segmentTree = structures
        .ConvertText<CreateTextSegments, TextSegment>()
        .ConvertText<CreateTreeSegments, TextSegment>()
        .Log<AnalyzeSegmentTitles>($"{reportDir}/segment-titles-tree.txt")
        .Log<AnalyzeTreeStructure>(Console.Out);

    // The returned list is not consumed here; the call is kept because it
    // presumably drives the pipeline to completion - TODO confirm.
    segmentTree.ToList();

    parserPipeline.ExtractOutput<ShowParserWarnings>($"{reportDir}/parser-errors.pdf");
}
/// <summary>
/// Parses one document through the text pipeline, converts the result into
/// <c>Conteudo</c> items and has <c>TransformArtigo</c> write them as JSON.
/// </summary>
/// <param name="virtualFS">File system implementation passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">Document name; forwarded to <c>GetTextLines</c> and to <c>CreateJson</c>.</param>
/// <param name="inputfolder">Input folder forwarded to <c>GetTextLines</c>.</param>
/// <param name="tmpfolder">Folder given to <c>GetTextLines</c> and used for the segment-title log.</param>
/// <param name="outputfolder">Root folder; <c>{outputfolder}/{basename}</c> is passed to <c>CreateJson</c>.</param>
public static void RunCreateArtigosJson(IVirtualFS virtualFS, string basename, string inputfolder, string tmpfolder, string outputfolder)
{
    VirtualFS.ConfigureFileSystem(virtualFS);
    PdfReaderException.ContinueOnException();

    Pipeline parserPipeline = new Pipeline();

    // Intermediate parsing output uses the temp folder, not the final output folder.
    var segmentTree = GetTextLines(parserPipeline, basename, inputfolder, tmpfolder)
        .ConvertText<CreateTextLineIndex, TextLine>()
        .ConvertText<PreCreateStructures, TextLine2>()
        .ConvertText<CreateStructures2, TextStructure>()
        .ConvertText<PreCreateTextSegments, TextStructureAgg>()
        .ConvertText<AggregateStructures, TextStructure>()
        .ConvertText<CreateTextSegments, TextSegment>()
        .ConvertText<CreateTreeSegments, TextSegment>()
        .Log<AnalyzeSegmentTitles>($"{tmpfolder}/{basename}/segment-titles-tree.txt");

    var contents = segmentTree
        .ConvertText<TransformConteudo, Conteudo>()
        .ToList();

    // The same TransformArtigo instance both builds the articles and serializes them.
    var articleBuilder = new TransformArtigo();
    var articles = articleBuilder.Create(contents);
    articleBuilder.CreateJson(articles, $"{outputfolder}/{basename}", basename);
}
/// <summary>
/// Runs the staged parser for one document: each <c>ParserStages</c> stage is
/// created on a shared <c>StageContext</c> and processed in a fixed order.
/// </summary>
/// <param name="virtualFS">File system implementation passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">Document name passed to the stage context.</param>
/// <param name="inputfolder">Input folder passed to the stage context.</param>
/// <param name="outputfolder">Output folder passed to the stage context.</param>
/// <returns>The context's <c>FileListOutput</c> after all stages have run.</returns>
public static Dictionary<string, string> RunParserPDF(IVirtualFS virtualFS, string basename, string inputfolder, string outputfolder)
{
    VirtualFS.ConfigureFileSystem(virtualFS);
    PdfReaderException.ContinueOnException();

    using (var stageContext = new ParserStages.StageContext(basename, inputfolder, outputfolder))
    {
        // Optional debug stage, currently disabled:
        // new ParserStages.StageDbgFlow(stageContext).Process();

        // Stages run strictly in this order, all sharing one context.
        new ParserStages.StageExtractHeaderDOU(stageContext).Process();
        new ParserStages.StagePdfInput(stageContext).Process();
        new ParserStages.StagePageMargins(stageContext).Process();
        new ParserStages.StageBlocksets(stageContext).Process();
        new ParserStages.StageRetrieveBlocks(stageContext).Process();

        new ParserStages.StageConvertText(stageContext).Process();

        // Alternative entry point, currently disabled:
        // stage.ProcessWithConfiguration($"{outputfolder}/{basename}/{basename}-tree.config");
        new ParserStages.StageConvertStructure(stageContext).Process();

        new ParserStages.StageConvertStructText(stageContext).Process();
        new ParserStages.StageConvertTree(stageContext).Process();
        new ParserStages.StageConvertContent(stageContext).Process();
        new ParserStages.StageConvertArtigoGN(stageContext).Process();

        // The returned values are not used; the calls are retained in case
        // GetOutput has side effects - TODO confirm and remove if it does not.
        stageContext.GetOutput("stage3");
        stageContext.GetOutput("tree");

        return stageContext.FileListOutput;
    }
}
/// <summary>
/// Reads <c>{basename}.pdf</c>, applies <c>ProcessPdfText</c> to every page with
/// lines shown in orange, and writes <c>{basename}-follow-text-output.pdf</c>.
/// </summary>
/// <param name="virtualFS">File system implementation passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">File name without extension, used for both the input and the output PDF.</param>
public static void FollowText(IVirtualFS virtualFS, string basename)
{
    VirtualFS.ConfigureFileSystem(virtualFS);

    var followPipeline = new Execution.Pipeline();

    string sourcePdf = $"{basename}.pdf";
    string annotatedPdf = $"{basename}-follow-text-output.pdf";

    followPipeline
        .Input(sourcePdf)
        .Output(annotatedPdf)
        .AllPages(pdfPage => pdfPage.ParsePdf<ProcessPdfText>().ShowLine(Color.Orange));

    followPipeline.Done();
}
/// <summary>
/// Runs only the <c>StageExtractHeaderDOU</c> stage for <paramref name="basename"/>,
/// using a fresh <c>VirtualFS</c> and the fixed folders <c>"input"</c> and <c>"output"</c>.
/// </summary>
/// <param name="basename">Document name passed to the stage context.</param>
/// <returns>The result of calling <c>ToString()</c> on the context's <c>FileListOutput</c>.</returns>
public static string ExtractHeader(string basename)
{
    VirtualFS.ConfigureFileSystem(new VirtualFS());
    PdfReaderException.ContinueOnException();

    using (var stageContext = new ParserStages.StageContext(basename, "input", "output"))
    {
        new ParserStages.StageExtractHeaderDOU(stageContext).Process();

        // NOTE(review): if FileListOutput is a plain Dictionary<string, string>,
        // ToString() yields only the type name rather than its entries - confirm
        // whether callers expect the actual file list here.
        var fileList = stageContext.FileListOutput;
        return fileList.ToString();
    }
}