/// <summary>
/// Runs the text-parsing chain for one document and writes its diagnostic
/// outputs under <c>{outputfolder}/{basename}/</c>.
/// </summary>
/// <param name="virtualFS">File system implementation handed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">Document name; forwarded to <c>GetTextLines</c> and used as the output sub-folder.</param>
/// <param name="inputfolder">Forwarded unchanged to <c>GetTextLines</c>.</param>
/// <param name="outputfolder">Forwarded to <c>GetTextLines</c> and used as the root of every path built here.</param>
public static void RunParserPDF(IVirtualFS virtualFS, string basename, string inputfolder, string outputfolder)
{
    VirtualFS.ConfigureFileSystem(virtualFS);
    PdfReaderException.ContinueOnException();

    Pipeline pipeline = new Pipeline();

    // Stage 1: obtain the text lines and log them.
    var textLines = GetTextLines(pipeline, basename, inputfolder, outputfolder)
        .Log<AnalyzeLines>($"{outputfolder}/{basename}/lines.txt");

    // Stage 2: lines -> structures.
    var structures = textLines
        .ConvertText<CreateTextLineIndex, TextLine>()
        .ConvertText<PreCreateStructures, TextLine2>()
        .ConvertText<CreateStructures2, TextStructure>()
        .ConvertText<PreCreateTextSegments, TextStructureAgg>()
        .ConvertText<AggregateStructures, TextStructure>();

    // Stage 3: structure diagnostics (one PDF, two text logs).
    var inspectedStructures = structures
        .ShowPdf<ShowStructureCentral>($"{outputfolder}/{basename}/show-central.pdf")
        .Log<AnalyzeStructures>($"{outputfolder}/{basename}/struct.txt")
        .Log<AnalyzeStructuresCentral>($"{outputfolder}/{basename}/central.txt");

    // Stage 4: structures -> segments -> tree, logged to a file and to the console.
    var segmentTree = inspectedStructures
        .ConvertText<CreateTextSegments, TextSegment>()
        .ConvertText<CreateTreeSegments, TextSegment>()
        .Log<AnalyzeSegmentTitles>($"{outputfolder}/{basename}/segment-titles-tree.txt")
        .Log<AnalyzeTreeStructure>(Console.Out);

    // The resulting list is never read; the call itself is kept.
    // NOTE(review): presumably ToList() is what forces the chain to execute - confirm.
    segmentTree.ToList();

    pipeline.ExtractOutput<ShowParserWarnings>($"{outputfolder}/{basename}/parser-errors.pdf");
}
/// <summary>
/// Runs the parsing chain up to <c>TransformConteudo</c>, then uses
/// <c>TransformArtigo</c> to build the articles and emit them via <c>CreateJson</c>.
/// </summary>
/// <param name="virtualFS">File system implementation handed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">Document name; forwarded to <c>GetTextLines</c>, used in paths and passed to <c>CreateJson</c>.</param>
/// <param name="inputfolder">Forwarded unchanged to <c>GetTextLines</c>.</param>
/// <param name="tmpfolder">Folder given to <c>GetTextLines</c> and used for the segment-titles log.</param>
/// <param name="outputfolder">Root of the <c>{outputfolder}/{basename}</c> path passed to <c>CreateJson</c>.</param>
public static void RunCreateArtigosJson(IVirtualFS virtualFS, string basename, string inputfolder, string tmpfolder, string outputfolder)
{
    VirtualFS.ConfigureFileSystem(virtualFS);
    PdfReaderException.ContinueOnException();

    Pipeline pipeline = new Pipeline();

    // Intermediate files go to the temp folder, not the final output folder.
    var textLines = GetTextLines(pipeline, basename, inputfolder, tmpfolder);

    // Lines -> structures.
    var structures = textLines
        .ConvertText<CreateTextLineIndex, TextLine>()
        .ConvertText<PreCreateStructures, TextLine2>()
        .ConvertText<CreateStructures2, TextStructure>()
        .ConvertText<PreCreateTextSegments, TextStructureAgg>()
        .ConvertText<AggregateStructures, TextStructure>();

    // Structures -> segments -> tree, with the title log written to the temp folder.
    var segmentTree = structures
        .ConvertText<CreateTextSegments, TextSegment>()
        .ConvertText<CreateTreeSegments, TextSegment>()
        .Log<AnalyzeSegmentTitles>($"{tmpfolder}/{basename}/segment-titles-tree.txt");

    var contents = segmentTree
        .ConvertText<TransformConteudo, Conteudo>()
        .ToList();

    // Build the articles from the content list and write the JSON output.
    var articleTransformer = new TransformArtigo();
    var articles = articleTransformer.Create(contents);
    articleTransformer.CreateJson(articles, $"{outputfolder}/{basename}", basename);
}
/// <summary>
/// Stores <paramref name="virtualFS"/> in <c>g_vfs</c>.
/// </summary>
/// <param name="virtualFS">The file system implementation to store; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="virtualFS"/> is null.</exception>
public static void ConfigureFileSystem(IVirtualFS virtualFS)
{
    if (virtualFS == null)
    {
        // Report the parameter name, not the interface type name, so the
        // exception's ParamName identifies the offending argument.
        throw new ArgumentNullException(nameof(virtualFS));
    }

    g_vfs = virtualFS;
}
/// <summary>
/// Creates a handler that keeps <paramref name="virtualFS"/> in <c>_virtualFS</c>.
/// </summary>
/// <param name="virtualFS">The file system implementation to keep; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="virtualFS"/> is null.</exception>
public PdfHandler(IVirtualFS virtualFS)
{
    // Reject a missing file system before touching any state.
    if (object.ReferenceEquals(virtualFS, null))
    {
        throw new ArgumentNullException(nameof(virtualFS));
    }

    _virtualFS = virtualFS;
}
/// <summary>
/// Runs the parser stages in a fixed order over one shared
/// <c>ParserStages.StageContext</c> and returns the context's <c>FileListOutput</c>.
/// </summary>
/// <param name="virtualFS">File system implementation handed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">Forwarded to the <c>StageContext</c> constructor.</param>
/// <param name="inputfolder">Forwarded to the <c>StageContext</c> constructor.</param>
/// <param name="outputfolder">Forwarded to the <c>StageContext</c> constructor.</param>
/// <returns>The value of <c>context.FileListOutput</c>, read after all stages have run.</returns>
public static Dictionary<string, string> RunParserPDF(IVirtualFS virtualFS, string basename, string inputfolder, string outputfolder)
{
    VirtualFS.ConfigureFileSystem(virtualFS);
    PdfReaderException.ContinueOnException();

    using (var context = new ParserStages.StageContext(basename, inputfolder, outputfolder))
    {
        // Disabled debugging stage, kept for reference:
        // new ParserStages.StageDbgFlow(context).Process();

        // PDF-level stages.
        new ParserStages.StageExtractHeaderDOU(context).Process();
        new ParserStages.StagePdfInput(context).Process();
        new ParserStages.StagePageMargins(context).Process();
        new ParserStages.StageBlocksets(context).Process();
        new ParserStages.StageRetrieveBlocks(context).Process();

        // Text-level stages.
        new ParserStages.StageConvertText(context).Process();
        new ParserStages.StageConvertStructure(context).Process();
        // Disabled alternative to the StageConvertStructure call above:
        // ProcessWithConfiguration($"{outputfolder}/{basename}/{basename}-tree.config");
        new ParserStages.StageConvertStructText(context).Process();
        new ParserStages.StageConvertTree(context).Process();
        new ParserStages.StageConvertContent(context).Process();
        new ParserStages.StageConvertArtigoGN(context).Process();

        // The returned values are not used; the calls are kept as-is.
        // NOTE(review): confirm whether GetOutput has side effects before removing them.
        context.GetOutput("stage3");
        context.GetOutput("tree");

        return context.FileListOutput;
    }
}
/// <summary>
/// Builds an <c>Execution.Pipeline</c> that reads <c>{basename}.pdf</c>, applies
/// <c>ProcessPdfText</c> and <c>ShowLine(Color.Orange)</c> to all pages, and targets
/// <c>{basename}-follow-text-output.pdf</c>; then calls <c>Done()</c> on the pipeline.
/// </summary>
/// <param name="virtualFS">File system implementation handed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
/// <param name="basename">File name without extension, used to derive the input and output names.</param>
public static void FollowText(IVirtualFS virtualFS, string basename)
{
    VirtualFS.ConfigureFileSystem(virtualFS);

    var pipeline = new Execution.Pipeline();

    string sourcePdf = $"{basename}.pdf";
    string annotatedPdf = $"{basename}-follow-text-output.pdf";

    // Bind input and output first, then describe the per-page processing.
    var document = pipeline.Input(sourcePdf).Output(annotatedPdf);
    document.AllPages(page => page.ParsePdf<ProcessPdfText>().ShowLine(Color.Orange));

    pipeline.Done();
}
/// <summary>
/// Creates an instance that keeps <paramref name="virtualFS"/> in <c>_webFs</c>.
/// </summary>
/// <param name="virtualFS">The file system implementation to keep. It is stored without validation.</param>
public OutputFiles(IVirtualFS virtualFS)
{
    // No null check here: the argument is stored exactly as received.
    _webFs = virtualFS;
}
/// <summary>
/// Creates a processor that keeps <paramref name="vfs"/> in <c>_vfs</c>.
/// </summary>
/// <param name="vfs">The file system implementation to keep. It is stored without validation.</param>
public PdfProcessor(IVirtualFS vfs)
{
    // No null check here: the argument is stored exactly as received.
    this._vfs = vfs;
}