예제 #1
0
        /// <summary>
        /// Runs the text-parsing pipeline for one document and emits its diagnostic outputs.
        /// </summary>
        /// <remarks>
        /// The chain built on top of <c>GetTextLines</c> is handed several targets under
        /// <c>{outputfolder}/{basename}/</c> (<c>lines.txt</c>, <c>show-central.pdf</c>,
        /// <c>struct.txt</c>, <c>central.txt</c>, <c>segment-titles-tree.txt</c>), plus
        /// <see cref="Console.Out"/> for the tree analysis. Afterwards the pipeline is asked to
        /// extract <c>parser-errors.pdf</c> into the same folder.
        /// </remarks>
        /// <param name="virtualFS">File system implementation passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
        /// <param name="basename">Document base name; forwarded to <c>GetTextLines</c> and used as the output sub-folder name.</param>
        /// <param name="inputfolder">Input folder forwarded to <c>GetTextLines</c>.</param>
        /// <param name="outputfolder">Root folder for every output path built here.</param>
        public static void RunParserPDF(IVirtualFS virtualFS, string basename, string inputfolder, string outputfolder)
        {
            VirtualFS.ConfigureFileSystem(virtualFS);
            PdfReaderException.ContinueOnException();

            var pipeline = new Pipeline();

            // Phase 1: raw text lines.
            var lines = GetTextLines(pipeline, basename, inputfolder, outputfolder)
                .Log<AnalyzeLines>($"{outputfolder}/{basename}/lines.txt");

            // Phase 2: lines -> aggregated text structures, with their diagnostics.
            var structures = lines
                .ConvertText<CreateTextLineIndex, TextLine>()
                .ConvertText<PreCreateStructures, TextLine2>()
                .ConvertText<CreateStructures2, TextStructure>()
                .ConvertText<PreCreateTextSegments, TextStructureAgg>()
                .ConvertText<AggregateStructures, TextStructure>()
                .ShowPdf<ShowStructureCentral>($"{outputfolder}/{basename}/show-central.pdf")
                .Log<AnalyzeStructures>($"{outputfolder}/{basename}/struct.txt")
                .Log<AnalyzeStructuresCentral>($"{outputfolder}/{basename}/central.txt");

            // Phase 3: structures -> segment tree, with their diagnostics.
            var segments = structures
                .ConvertText<CreateTextSegments, TextSegment>()
                .ConvertText<CreateTreeSegments, TextSegment>()
                .Log<AnalyzeSegmentTitles>($"{outputfolder}/{basename}/segment-titles-tree.txt")
                .Log<AnalyzeTreeStructure>(Console.Out);

            // The resulting list is not used; the call is kept because it presumably
            // drives enumeration of the chain above -- TODO confirm it is lazy.
            segments.ToList();

            pipeline.ExtractOutput<ShowParserWarnings>($"{outputfolder}/{basename}/parser-errors.pdf");
        }
예제 #2
0
        /// <summary>
        /// Parses one document down to <c>Conteudo</c> items and hands them to
        /// <c>TransformArtigo</c> to create the articles and their JSON output.
        /// </summary>
        /// <param name="virtualFS">File system implementation passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
        /// <param name="basename">Document base name; forwarded to <c>GetTextLines</c> and <c>CreateJson</c>, and used as the sub-folder name.</param>
        /// <param name="inputfolder">Input folder forwarded to <c>GetTextLines</c>.</param>
        /// <param name="tmpfolder">Folder passed to <c>GetTextLines</c> as its output location and used for <c>segment-titles-tree.txt</c>.</param>
        /// <param name="outputfolder">Root folder of the path <c>{outputfolder}/{basename}</c> given to <c>CreateJson</c>.</param>
        public static void RunCreateArtigosJson(IVirtualFS virtualFS, string basename, string inputfolder, string tmpfolder, string outputfolder)
        {
            VirtualFS.ConfigureFileSystem(virtualFS);
            PdfReaderException.ContinueOnException();

            var pipeline = new Pipeline();

            // Intermediate pipeline files go to the temp folder, not the final output folder.
            var textLines = GetTextLines(pipeline, basename, inputfolder, tmpfolder);

            var structures = textLines
                .ConvertText<CreateTextLineIndex, TextLine>()
                .ConvertText<PreCreateStructures, TextLine2>()
                .ConvertText<CreateStructures2, TextStructure>()
                .ConvertText<PreCreateTextSegments, TextStructureAgg>()
                .ConvertText<AggregateStructures, TextStructure>();

            var segmentTree = structures
                .ConvertText<CreateTextSegments, TextSegment>()
                .ConvertText<CreateTreeSegments, TextSegment>()
                .Log<AnalyzeSegmentTitles>($"{tmpfolder}/{basename}/segment-titles-tree.txt");

            var contents = segmentTree
                .ConvertText<TransformConteudo, Conteudo>()
                .ToList();

            // The same TransformArtigo instance both builds the articles and serialises them.
            var articleTransformer = new TransformArtigo();
            var articles = articleTransformer.Create(contents);

            articleTransformer.CreateJson(articles, $"{outputfolder}/{basename}", basename);
        }
예제 #3
0
        /// <summary>
        /// Runs the staged parser for one document and returns the context's file-list output.
        /// </summary>
        /// <remarks>
        /// Every stage is constructed over the same <c>ParserStages.StageContext</c> and processed
        /// immediately, in this order: header extraction, PDF input, page margins, blocksets,
        /// block retrieval, text conversion, structure conversion, structured-text conversion,
        /// tree conversion, content conversion and article conversion.
        /// </remarks>
        /// <param name="virtualFS">File system implementation passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
        /// <param name="basename">Document base name given to the stage context.</param>
        /// <param name="inputfolder">Input folder given to the stage context.</param>
        /// <param name="outputfolder">Output folder given to the stage context.</param>
        /// <returns>The value of <c>context.FileListOutput</c> read before the context is disposed.</returns>
        public static Dictionary<string, string> RunParserPDF(IVirtualFS virtualFS, string basename, string inputfolder, string outputfolder)
        {
            VirtualFS.ConfigureFileSystem(virtualFS);
            PdfReaderException.ContinueOnException();

            using (var context = new ParserStages.StageContext(basename, inputfolder, outputfolder))
            {
                //new ParserStages.StageDbgFlow(context).Process();

                new ParserStages.StageExtractHeaderDOU(context).Process();

                // Layout stages.
                new ParserStages.StagePdfInput(context).Process();
                new ParserStages.StagePageMargins(context).Process();
                new ParserStages.StageBlocksets(context).Process();
                new ParserStages.StageRetrieveBlocks(context).Process();

                // Text stages.
                new ParserStages.StageConvertText(context).Process();
                new ParserStages.StageConvertStructure(context).Process();
                //new ParserStages.StageConvertStructure(context).ProcessWithConfiguration($"{outputfolder}/{basename}/{basename}-tree.config");
                new ParserStages.StageConvertStructText(context).Process();
                new ParserStages.StageConvertTree(context).Process();
                new ParserStages.StageConvertContent(context).Process();
                new ParserStages.StageConvertArtigoGN(context).Process();

                // The returned strings are not used here; the calls are kept in case
                // GetOutput has side effects -- TODO confirm and remove if it does not.
                context.GetOutput("stage3");
                context.GetOutput("tree");

                return context.FileListOutput;
            }
        }
예제 #4
0
        /// <summary>
        /// Builds and completes a pipeline that reads <c>{basename}.pdf</c>, applies
        /// <c>ProcessPdfText</c> and <c>ShowLine(Color.Orange)</c> to every page, and targets
        /// <c>{basename}-follow-text-output.pdf</c> as its output.
        /// </summary>
        /// <param name="virtualFS">File system implementation passed to <c>VirtualFS.ConfigureFileSystem</c>.</param>
        /// <param name="basename">Path prefix shared by the input and output PDF names.</param>
        public static void FollowText(IVirtualFS virtualFS, string basename)
        {
            VirtualFS.ConfigureFileSystem(virtualFS);

            Execution.Pipeline pipeline = new Execution.Pipeline();

            var source = pipeline.Input($"{basename}.pdf");
            var target = source.Output($"{basename}-follow-text-output.pdf");

            // Same per-page recipe for all pages: parse the text, then draw the lines in orange.
            target.AllPages(p => p.ParsePdf<ProcessPdfText>().ShowLine(Color.Orange));

            pipeline.Done();
        }
예제 #5
0
        /// <summary>
        /// Runs only the header-extraction stage for <paramref name="basename"/>, using a freshly
        /// created <c>VirtualFS</c> and the fixed folders <c>"input"</c> and <c>"output"</c>.
        /// </summary>
        /// <param name="basename">Document base name given to the stage context.</param>
        /// <returns>The result of calling <c>ToString()</c> on <c>context.FileListOutput</c>.</returns>
        public static string ExtractHeader(string basename)
        {
            VirtualFS fileSystem = new VirtualFS();
            VirtualFS.ConfigureFileSystem(fileSystem);

            PdfReaderException.ContinueOnException();

            using (var context = new ParserStages.StageContext(basename, "input", "output"))
            {
                new ParserStages.StageExtractHeaderDOU(context).Process();

                // NOTE(review): if FileListOutput is a Dictionary<string, string> that does not
                // override ToString(), this yields the type name rather than the listed files.
                // Confirm what callers expect before changing it.
                string fileList = context.FileListOutput.ToString();
                return fileList;
            }
        }