예제 #1
0
        /// <summary>
        /// Extracts all text from "ExtractText.pdf" in the example data directory and
        /// writes it to "output_out.txt" in the same directory.
        /// </summary>
        public static void Run()
        {
            // ExStart:ExtractText
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdfFacades_Text();

            // Open input PDF
            PdfExtractor pdfExtractor = new PdfExtractor();
            pdfExtractor.BindPdf(dataDir + "ExtractText.pdf");

            string text;
            try
            {
                // Use parameterless ExtractText method
                pdfExtractor.ExtractText();

                // The reader owns the stream from the start, so disposing the reader
                // also disposes the MemoryStream even if GetText throws.
                MemoryStream tempMemoryStream = new MemoryStream();
                using (StreamReader streamReader = new StreamReader(tempMemoryStream, Encoding.Unicode))
                {
                    pdfExtractor.GetText(tempMemoryStream);

                    // Rewind before decoding the extracted bytes as UTF-16.
                    tempMemoryStream.Seek(0, SeekOrigin.Begin);
                    text = streamReader.ReadToEnd();
                }
            }
            finally
            {
                // Release the bound PDF even when extraction fails.
                pdfExtractor.Close();
            }

            File.WriteAllText(dataDir + "output_out.txt", text);
            // ExEnd:ExtractText
        }
        /// <summary>
        /// Extracts the text of page 1 from "ExtractText-PageRange.pdf" in the example
        /// data directory and writes it to "output.txt" in the same directory.
        /// </summary>
        public static void Run()
        {
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdfFacades_Text();

            // Open input PDF
            PdfExtractor pdfExtractor = new PdfExtractor();
            pdfExtractor.BindPdf(dataDir + "ExtractText-PageRange.pdf");

            string text;
            try
            {
                // Limit extraction to the first page only
                pdfExtractor.StartPage = 1;
                pdfExtractor.EndPage = 1;

                // Use parameterless ExtractText method
                pdfExtractor.ExtractText();

                // The reader owns the stream, so it is disposed even if GetText throws.
                MemoryStream tempMemoryStream = new MemoryStream();
                using (StreamReader sr = new StreamReader(tempMemoryStream, Encoding.Unicode))
                {
                    pdfExtractor.GetText(tempMemoryStream);

                    // Rewind before decoding the extracted bytes as UTF-16.
                    tempMemoryStream.Seek(0, SeekOrigin.Begin);
                    text = sr.ReadToEnd();
                }
            }
            finally
            {
                // Release the bound PDF even when extraction fails.
                pdfExtractor.Close();
            }

            File.WriteAllText(dataDir + "output.txt", text);
        }
예제 #3
0
        /// <summary>
        /// Extracts the text of "input.pdf" page by page and writes each page to
        /// "output&lt;pageNumber&gt;.txt" in the data directory, numbering from 1.
        /// </summary>
        public static void Main()
        {
            // The path to the documents directory.
            string dataDir = Path.GetFullPath("../../../Data/");

            // Open input PDF
            PdfExtractor pdfExtractor = new PdfExtractor();
            pdfExtractor.BindPdf(dataDir + "input.pdf");

            try
            {
                // Use parameterless ExtractText method
                pdfExtractor.ExtractText();

                int pageNumber = 1;

                while (pdfExtractor.HasNextPageText())
                {
                    string text;

                    // The reader owns the per-page stream, so it is disposed even if
                    // GetNextPageText throws.
                    MemoryStream tempMemoryStream = new MemoryStream();
                    using (StreamReader streamReader = new StreamReader(tempMemoryStream, Encoding.Unicode))
                    {
                        pdfExtractor.GetNextPageText(tempMemoryStream);

                        // Rewind before decoding the extracted bytes as UTF-16.
                        tempMemoryStream.Seek(0, SeekOrigin.Begin);
                        text = streamReader.ReadToEnd();
                    }

                    File.WriteAllText(dataDir + "output" + pageNumber + ".txt", text);
                    pageNumber++;
                }
            }
            finally
            {
                // Release the bound PDF even when extraction fails part-way.
                pdfExtractor.Close();
            }
        }
예제 #4
0
        /// <summary>
        /// Collects basic statistics about the PDF at <paramref name="filePath"/> and
        /// stores them on <paramref name="fo"/>.
        /// </summary>
        /// <param name="fo">
        /// Receives pageCount, embeddedDocsCount, imageCount, hasPassword, wordCount
        /// and characterCount.
        /// </param>
        /// <param name="filePath">Path of the PDF file to inspect.</param>
        private void ParsePDF(ref FileObject fo, string filePath)
        {
            Aspose.Pdf.Document    pdfDocument = new Aspose.Pdf.Document(filePath);
            PdfFileInfo            pi          = new PdfFileInfo(pdfDocument);
            PdfExtractor           pe          = new PdfExtractor(pdfDocument);
            ImagePlacementAbsorber abs         = new ImagePlacementAbsorber();

            fo.pageCount         = pi.NumberOfPages;
            fo.embeddedDocsCount = pdfDocument.EmbeddedFiles.Count;

            // Visit every page so the absorber can collect the image placements.
            pdfDocument.Pages.Accept(abs);
            fo.imageCount  = abs.ImagePlacements.Count;
            fo.hasPassword = pi.HasOpenPassword;

            // Text is extracted as ASCII and dumped to a throw-away file so the
            // word/character counters can work on a plain text file.
            pe.ExtractText(Encoding.ASCII);
            string tmpFolderToExtract = tmpFolder + "\\" + Guid.NewGuid();
            string tmpTextFile        = tmpFolderToExtract + "\\" + "tmpTextexport.txt";

            Directory.CreateDirectory(tmpFolderToExtract);
            try
            {
                pe.GetText(tmpTextFile);
                fo.wordCount      = GetWordCount(tmpTextFile);
                fo.characterCount = GetCharCount(tmpTextFile);
            }
            finally
            {
                // Clean up the temporary artefacts even if extraction or counting
                // throws, so failed runs do not accumulate files in tmpFolder.
                if (File.Exists(tmpTextFile))
                {
                    File.Delete(tmpTextFile);
                }
                if (Directory.Exists(tmpFolderToExtract))
                {
                    Directory.Delete(tmpFolderToExtract);
                }
            }
        }
예제 #5
0
        /// <summary>
        /// Extracts the text of page 1 from "input.pdf" in the data directory and
        /// writes it to "output.txt" in the same directory.
        /// </summary>
        public static void Main()
        {
            // The path to the documents directory.
            string dataDir = Path.GetFullPath("../../../Data/");

            // Open input PDF
            PdfExtractor pdfExtractor = new PdfExtractor();
            pdfExtractor.BindPdf(dataDir + "input.pdf");

            string text;
            try
            {
                // Limit extraction to the first page only
                pdfExtractor.StartPage = 1;
                pdfExtractor.EndPage   = 1;

                // Use parameterless ExtractText method
                pdfExtractor.ExtractText();

                // The reader owns the stream, so it is disposed even if GetText throws.
                MemoryStream tempMemoryStream = new MemoryStream();
                using (StreamReader sr = new StreamReader(tempMemoryStream, Encoding.Unicode))
                {
                    pdfExtractor.GetText(tempMemoryStream);

                    // Rewind before decoding the extracted bytes as UTF-16.
                    tempMemoryStream.Seek(0, SeekOrigin.Begin);
                    text = sr.ReadToEnd();
                }
            }
            finally
            {
                // Release the bound PDF even when extraction fails.
                pdfExtractor.Close();
            }

            File.WriteAllText(dataDir + "output.txt", text);
        }
        /// <summary>
        /// Extracts the text of "ExtractText-Page.pdf" page by page and writes each
        /// page to "output&lt;pageNumber&gt;.txt" in the example data directory.
        /// </summary>
        public static void Run()
        {
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdfFacades_Text();

            // Open input PDF
            PdfExtractor pdfExtractor = new PdfExtractor();
            pdfExtractor.BindPdf(dataDir + "ExtractText-Page.pdf");

            try
            {
                // Use parameterless ExtractText method
                pdfExtractor.ExtractText();

                int pageNumber = 1;

                while (pdfExtractor.HasNextPageText())
                {
                    string text;

                    // The reader owns the per-page stream, so it is disposed even if
                    // GetNextPageText throws.
                    MemoryStream tempMemoryStream = new MemoryStream();
                    using (StreamReader streamReader = new StreamReader(tempMemoryStream, Encoding.Unicode))
                    {
                        pdfExtractor.GetNextPageText(tempMemoryStream);

                        // Rewind before decoding the extracted bytes as UTF-16.
                        tempMemoryStream.Seek(0, SeekOrigin.Begin);
                        text = streamReader.ReadToEnd();
                    }

                    File.WriteAllText(dataDir + "output" + pageNumber + ".txt", text);
                    pageNumber++;
                }
            }
            finally
            {
                // Release the bound PDF even when extraction fails part-way.
                pdfExtractor.Close();
            }
        }
        /// <summary>
        /// Extracts the text of "ExtractText-Page.pdf" page by page and writes each
        /// page to "output&lt;pageNumber&gt;_out.txt" in the example data directory.
        /// </summary>
        public static void Run()
        {
            // ExStart:ExtractTextPage
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdfFacades_Text();

            // Open input PDF
            PdfExtractor pdfExtractor = new PdfExtractor();
            pdfExtractor.BindPdf(dataDir + "ExtractText-Page.pdf");

            try
            {
                // Use parameterless ExtractText method
                pdfExtractor.ExtractText();

                int pageNumber = 1;

                while (pdfExtractor.HasNextPageText())
                {
                    string text;

                    // The reader owns the per-page stream, so it is disposed even if
                    // GetNextPageText throws.
                    MemoryStream tempMemoryStream = new MemoryStream();
                    using (StreamReader streamReader = new StreamReader(tempMemoryStream, Encoding.Unicode))
                    {
                        pdfExtractor.GetNextPageText(tempMemoryStream);

                        // Rewind before decoding the extracted bytes as UTF-16.
                        tempMemoryStream.Seek(0, SeekOrigin.Begin);
                        text = streamReader.ReadToEnd();
                    }

                    File.WriteAllText(dataDir + "output" + pageNumber + "_out.txt", text);
                    pageNumber++;
                }
            }
            finally
            {
                // Release the bound PDF even when extraction fails part-way.
                pdfExtractor.Close();
            }
            // ExEnd:ExtractTextPage
        }
        /// <summary>
        /// Reports on the console whether "FilledForm.pdf" contains text only,
        /// images only, both, or neither.
        /// </summary>
        public static void Run()
        {
            // ExStart:PdfContainsTextOrImages
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdfFacades_TechnicalArticles();

            // Instantiate PdfExtractor object and bind the input PDF document to it
            PdfExtractor extractor = new PdfExtractor();
            extractor.BindPdf(dataDir + "FilledForm.pdf");

            bool containsText;
            bool containsImage;

            try
            {
                // Extract text from the input PDF document
                extractor.ExtractText();

                // Copy the extracted text into a memory stream; any bytes at all
                // are treated as "the document contains text".
                using (MemoryStream ms = new MemoryStream())
                {
                    extractor.GetText(ms);
                    containsText = ms.Length >= 1;
                }

                // Extract images from the input PDF document; a single available
                // image is enough to know the document contains images.
                extractor.ExtractImage();
                containsImage = extractor.HasNextImage();
            }
            finally
            {
                // Release the bound PDF even when extraction fails.
                extractor.Close();
            }

            // Now find out whether this PDF is text only or image only
            if (containsText && !containsImage)
            {
                Console.WriteLine("PDF contains text only");
            }
            else if (!containsText && containsImage)
            {
                Console.WriteLine("PDF contains image only");
            }
            else if (containsText && containsImage)
            {
                Console.WriteLine("PDF contains both text and image");
            }
            else
            {
                Console.WriteLine("PDF contains neither text nor image");
            }
            // ExEnd:PdfContainsTextOrImages
        }
예제 #9
0
        /// <summary>
        /// Checks that the RPD content extractor finds the expected code in the
        /// text extracted from a PDF file and classifies the document as RPD.
        /// </summary>
        /// <param name="path">Path of the PDF file whose bytes are fed to the extractor.</param>
        /// <param name="plus">Single entry of the first list passed to the extractor configuration.</param>
        /// <param name="code">Code expected to be the first one found.</param>
        public void PdfExtractTextRpdTest(string path, string plus, string code)
        {
            var textExtractor = new PdfExtractor(new ContentImageExtractor());

            // Build the configuration step by step: one-entry first list, empty
            // second list, and the pattern with the named "code" group.
            var firstList  = new List <string> { plus };
            var secondList = new List <string>();
            var config     = new RpdExtractorConfig(firstList, secondList, @"(?<code>\d\d\.0\d\.\d\d)($|\D)");
            var rpdExtractor = new RpdContentExtractor(config);

            byte[] pdfBytes = File.ReadAllBytes(path);

            var extracted = textExtractor.ExtractText(pdfBytes, ".pdf");
            var result    = rpdExtractor.Extract(extracted.Content);

            Assert.True(result.Codes.Count > 0);
            Assert.AreEqual(code, result.Codes.First());
            Assert.AreEqual(DocumentType.Rpd, result.DocumentType);
        }
        /// <summary>
        /// Reports on the console whether "FilledForm.pdf" contains text only,
        /// images only, both, or neither.
        /// </summary>
        public static void Run()
        {
            // ExStart:PdfContainsTextOrImages
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdfFacades_TechnicalArticles();

            // Instantiate PdfExtractor object and bind the input PDF document to it
            PdfExtractor extractor = new PdfExtractor();
            extractor.BindPdf(dataDir + "FilledForm.pdf");

            bool containsText;
            bool containsImage;

            try
            {
                // Extract text from the input PDF document
                extractor.ExtractText();

                // Copy the extracted text into a memory stream; any bytes at all
                // are treated as "the document contains text".
                using (MemoryStream ms = new MemoryStream())
                {
                    extractor.GetText(ms);
                    containsText = ms.Length >= 1;
                }

                // Extract images from the input PDF document; a single available
                // image is enough to know the document contains images.
                extractor.ExtractImage();
                containsImage = extractor.HasNextImage();
            }
            finally
            {
                // Release the bound PDF even when extraction fails.
                extractor.Close();
            }

            // Now find out whether this PDF is text only or image only
            if (containsText && !containsImage)
            {
                Console.WriteLine("PDF contains text only");
            }
            else if (!containsText && containsImage)
            {
                Console.WriteLine("PDF contains image only");
            }
            else if (containsText && containsImage)
            {
                Console.WriteLine("PDF contains both text and image");
            }
            else
            {
                Console.WriteLine("PDF contains neither text nor image");
            }
            // ExEnd:PdfContainsTextOrImages
        }
        /// <summary>
        /// Demonstrates the main PdfExtractor features on "inFile.pdf": saves the
        /// text of pages 1-10, the next available page text, the first image and
        /// the attachments into the example data directory.
        /// </summary>
        public static void Run()
        {
            // ExStart:PdfExtractorFeatures
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdfFacades_TechnicalArticles();

            // Create an instance of PdfExtractor class
            PdfExtractor extractor = new PdfExtractor();

            // Set PDF file password
            extractor.Password = "";
            // Specify start and end pages of the PDF
            extractor.StartPage = 1;
            extractor.EndPage   = 10;

            // Bind PDF file with the extractor object
            extractor.BindPdf(dataDir + "inFile.pdf");

            try
            {
                // Extract all text from the PDF
                extractor.ExtractText();
                // Save extracted text in a text file
                extractor.GetText(dataDir + "PdfExtractorFeatures_text_out_.txt");

                // Per-page text can also be saved to its own file. Only the next
                // available page is written here; loop on HasNextPageText to save all.
                if (extractor.HasNextPageText())
                {
                    extractor.GetNextPageText(dataDir + DateTime.Now.Ticks.ToString() + "_out_.txt");
                }

                // Extract images from PDF file
                extractor.ExtractImage();
                // Save the first image only; loop on HasNextImage to save every image.
                if (extractor.HasNextImage())
                {
                    extractor.GetNextImage(dataDir + DateTime.Now.Ticks.ToString() + "_out_.jpg", System.Drawing.Imaging.ImageFormat.Jpeg);
                }

                // Extract attachments
                extractor.ExtractAttachment();
                extractor.GetAttachment(dataDir);
            }
            finally
            {
                // Release the bound PDF even when one of the extraction steps fails.
                extractor.Close();
            }
            // ExEnd:PdfExtractorFeatures
        }
        /// <summary>
        /// Demonstrates the main PdfExtractor features on "inFile.pdf": saves the
        /// text of pages 1-10, the next available page text, the first image and
        /// the attachments into the example data directory.
        /// </summary>
        public static void Run()
        {
            // ExStart:PdfExtractorFeatures
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdfFacades_TechnicalArticles();

            // Create an instance of PdfExtractor class
            PdfExtractor extractor = new PdfExtractor();

            // Set PDF file password
            extractor.Password = "";
            // Specify start and end pages of the PDF
            extractor.StartPage = 1;
            extractor.EndPage = 10;

            // Bind PDF file with the extractor object
            extractor.BindPdf(dataDir + "inFile.pdf");

            try
            {
                // Extract all text from the PDF
                extractor.ExtractText();
                // Save extracted text in a text file
                extractor.GetText(dataDir + "PdfExtractorFeatures_text_out.txt");

                // Per-page text can also be saved to its own file. Only the next
                // available page is written here; loop on HasNextPageText to save all.
                if (extractor.HasNextPageText())
                {
                    extractor.GetNextPageText(dataDir + DateTime.Now.Ticks.ToString() + "_out.txt");
                }

                // Extract images from PDF file
                extractor.ExtractImage();
                // Save the first image only; loop on HasNextImage to save every image.
                if (extractor.HasNextImage())
                {
                    extractor.GetNextImage(dataDir + DateTime.Now.Ticks.ToString() + "_out.jpg", System.Drawing.Imaging.ImageFormat.Jpeg);
                }

                // Extract attachments
                extractor.ExtractAttachment();
                extractor.GetAttachment(dataDir);
            }
            finally
            {
                // Release the bound PDF even when one of the extraction steps fails.
                extractor.Close();
            }
            // ExEnd:PdfExtractorFeatures
        }
예제 #13
0
파일: Convert.cs 프로젝트: vaginessa/open
 /// <summary>
 /// Writes the text of the current document to the target text file, converting
 /// Word, Excel and PowerPoint sources to PDF first, and reports progress
 /// through <paramref name="progress"/>.
 /// </summary>
 /// <param name="progress">Optional progress callback invoked on <paramref name="dlg"/> with a percentage; may be null.</param>
 /// <param name="dlg">Form used to marshal the progress callback.</param>
 /// <param name="fileType">Source extension including the dot, e.g. ".pdf" or ".docx".</param>
 private void pdf_to_txt(save_progress progress, System.Windows.Forms.Form dlg, string fileType)
 {
     try
     {
         Aspose.Pdf.Document document = null;
         // Progress already consumed by the conversion step: 0 for native PDFs,
         // 50 when the source had to be converted to PDF first.
         int num = 0;
         if (fileType == ".pdf")
         {
             document = this.pdf_doc;
             num      = 0;
         }
         else if ((fileType == ".doc") || (fileType == ".docx"))
         {
             document = this.doc_to_pdf(progress, dlg, 0);
             num      = 50;
         }
         else if ((fileType == ".xls") || (fileType == ".xlsx"))
         {
             document = this.xls_to_pdf(progress, dlg, 0);
             num      = 50;
         }
         else if ((fileType == ".ppt") || (fileType == ".pptx"))
         {
             document = this.ppt_to_pdf(progress, dlg, 0);
             num      = 50;
         }

         if (document == null)
         {
             // Unsupported file type or no document available: bail out before
             // the output file is created or truncated.
             return;
         }

         PdfExtractor extractor = new PdfExtractor(document);
         // NOTE(review): 0 presumably selects the default text extraction mode - confirm against the Aspose docs.
         extractor.ExtractTextMode = 0;

         string outputPath = this.global_config.target_dic + Path.GetFileNameWithoutExtension(this.file_path) + this.get_suffix();

         // 'using' guarantees the output file handle is released even when
         // extraction or a progress callback throws.
         using (FileStream outputStream = new FileStream(outputPath, FileMode.Create))
         {
             if (progress != null)
             {
                 dlg.Invoke(progress, new object[] { num });
             }

             int pageCount = document.Pages.Count;
             for (int i = 1; i <= pageCount; i++)
             {
                 // Extract one page at a time so progress can be reported per page.
                 extractor.StartPage = i;
                 extractor.EndPage   = i;
                 extractor.ExtractText(Encoding.UTF8);
                 extractor.GetText(outputStream);
                 if (progress != null)
                 {
                     if (num == 50)
                     {
                         // Conversion used the first half; map pages onto 50..100.
                         dlg.Invoke(progress, new object[] { ((i * 50) / pageCount) + 50 });
                     }
                     else
                     {
                         dlg.Invoke(progress, new object[] { (i * 100) / pageCount });
                     }
                 }
             }
         }
     }
     catch (Exception)
     {
         // Best-effort export: failures are deliberately swallowed and the
         // final 100% progress notification is skipped.
         return;
     }
     if (progress != null)
     {
         dlg.Invoke(progress, new object[] { 100 });
     }
 }