/// <summary>
/// Extracts all text from "ExtractText.pdf" in the facades text data directory and
/// writes it to "output_out.txt" in the same directory.
/// </summary>
public static void Run()
{
    // ExStart:ExtractText
    // Resolve the folder that holds the sample documents.
    string dataDir = RunExamples.GetDataDir_AsposePdfFacades_Text();

    // Bind the source PDF to a fresh extractor.
    PdfExtractor extractor = new PdfExtractor();
    extractor.BindPdf(dataDir + "ExtractText.pdf");

    // Run the extraction using the parameterless overload.
    extractor.ExtractText();

    // Collect the extracted text in an in-memory buffer.
    MemoryStream buffer = new MemoryStream();
    extractor.GetText(buffer);

    // Decode the buffer from its beginning using Encoding.Unicode.
    string extracted;
    using (StreamReader reader = new StreamReader(buffer, Encoding.Unicode))
    {
        buffer.Seek(0, SeekOrigin.Begin);
        extracted = reader.ReadToEnd();
    }

    File.WriteAllText(dataDir + "output_out.txt", extracted);
    // ExEnd:ExtractText
}
/// <summary>
/// Extracts the text of page 1 of "ExtractText-PageRange.pdf" and writes it to
/// "output.txt" in the facades text data directory.
/// </summary>
public static void Run()
{
    // Folder containing the sample documents.
    string dataDir = RunExamples.GetDataDir_AsposePdfFacades_Text();
    string sourcePath = dataDir + "ExtractText-PageRange.pdf";
    string targetPath = dataDir + "output.txt";

    // Bind the input PDF.
    PdfExtractor extractor = new PdfExtractor();
    extractor.BindPdf(sourcePath);

    // Restrict the extraction to a single page (first == last == 1).
    extractor.StartPage = 1;
    extractor.EndPage = 1;

    // Extract with the parameterless overload and pull the result into memory.
    extractor.ExtractText();
    MemoryStream textStream = new MemoryStream();
    extractor.GetText(textStream);

    // Rewind, then decode the stream with Encoding.Unicode.
    string pageText;
    textStream.Position = 0;
    using (StreamReader reader = new StreamReader(textStream, Encoding.Unicode))
    {
        pageText = reader.ReadToEnd();
    }

    File.WriteAllText(targetPath, pageText);
}
/// <summary>
/// Extracts the text of "input.pdf" page by page and writes each page to its own
/// file ("output1.txt", "output2.txt", ...) in the data directory.
/// </summary>
public static void Main()
{
    // Data folder, resolved relative to the current working directory.
    string dataDir = Path.GetFullPath("../../../Data/");

    // Bind the input PDF.
    PdfExtractor extractor = new PdfExtractor();
    extractor.BindPdf(dataDir + "input.pdf");

    // Extract with the parameterless overload.
    extractor.ExtractText();

    // Pull the text out one page at a time; the counter only numbers the output files.
    for (int pageNumber = 1; extractor.HasNextPageText(); pageNumber++)
    {
        MemoryStream pageStream = new MemoryStream();
        extractor.GetNextPageText(pageStream);

        // Decode the page buffer from its start using Encoding.Unicode.
        string pageText;
        using (StreamReader reader = new StreamReader(pageStream, Encoding.Unicode))
        {
            pageStream.Seek(0, SeekOrigin.Begin);
            pageText = reader.ReadToEnd();
        }

        string pageFile = dataDir + "output" + pageNumber + ".txt";
        File.WriteAllText(pageFile, pageText);
    }
}
/// <summary>
/// Loads the PDF at <paramref name="filePath"/> and fills <paramref name="fo"/> with
/// its page count, embedded file count, image placement count, open-password flag,
/// word count and character count.
/// </summary>
/// <param name="fo">Object that receives the collected statistics.</param>
/// <param name="filePath">Path of the PDF file to inspect.</param>
/// <remarks>
/// The text is exported to a temporary file inside a uniquely named sub-folder of
/// <c>tmpFolder</c>; that folder is removed again even when extraction or counting fails.
/// </remarks>
private void ParsePDF(ref FileObject fo, string filePath)
{
    // Dispose the document when done so the input file is not kept open.
    using (Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(filePath))
    {
        PdfFileInfo pi = new PdfFileInfo(pdfDocument);
        PdfExtractor pe = new PdfExtractor(pdfDocument);
        ImagePlacementAbsorber abs = new ImagePlacementAbsorber();

        fo.pageCount = pi.NumberOfPages;
        fo.embeddedDocsCount = pdfDocument.EmbeddedFiles.Count;

        // Let the absorber visit every page to collect the image placements.
        pdfDocument.Pages.Accept(abs);
        fo.imageCount = abs.ImagePlacements.Count;
        fo.hasPassword = pi.HasOpenPassword;

        // NOTE(review): ASCII extraction cannot represent non-ASCII characters —
        // confirm the word/character counts are meant to be ASCII-based.
        pe.ExtractText(Encoding.ASCII);

        // Path.Combine avoids hard-coding the directory separator.
        string tmpFolderToExtract = Path.Combine(tmpFolder, Guid.NewGuid().ToString());
        string tmpTextFile = Path.Combine(tmpFolderToExtract, "tmpTextexport.txt");
        Directory.CreateDirectory(tmpFolderToExtract);

        try
        {
            pe.GetText(tmpTextFile);
            fo.wordCount = GetWordCount(tmpTextFile);
            fo.characterCount = GetCharCount(tmpTextFile);
        }
        finally
        {
            // Always remove the temporary folder (and the export file inside it),
            // including when one of the calls above throws.
            if (Directory.Exists(tmpFolderToExtract))
            {
                Directory.Delete(tmpFolderToExtract, true);
            }
        }
    }
}
/// <summary>
/// Extracts the text of page 1 of "input.pdf" and writes it to "output.txt"
/// in the data directory.
/// </summary>
public static void Main()
{
    // Data folder, resolved relative to the current working directory.
    string dataDir = Path.GetFullPath("../../../Data/");

    // Bind the input PDF.
    PdfExtractor extractor = new PdfExtractor();
    extractor.BindPdf(dataDir + "input.pdf");

    // Limit the extraction to the first page only.
    extractor.StartPage = 1;
    extractor.EndPage = 1;

    // Extract with the parameterless overload.
    extractor.ExtractText();

    // Receive the extracted text in memory.
    MemoryStream extractedBytes = new MemoryStream();
    extractor.GetText(extractedBytes);

    // Decode from the start of the buffer using Encoding.Unicode.
    string content;
    using (StreamReader unicodeReader = new StreamReader(extractedBytes, Encoding.Unicode))
    {
        unicodeReader.BaseStream.Position = 0;
        content = unicodeReader.ReadToEnd();
    }

    string outputFile = dataDir + "output.txt";
    File.WriteAllText(outputFile, content);
}
/// <summary>
/// Extracts the text of "ExtractText-Page.pdf" one page at a time and writes each
/// page to "output&lt;N&gt;.txt" in the facades text data directory.
/// </summary>
public static void Run()
{
    // Folder containing the sample documents.
    string dataDir = RunExamples.GetDataDir_AsposePdfFacades_Text();

    // Bind the input PDF and extract with the parameterless overload.
    PdfExtractor extractor = new PdfExtractor();
    extractor.BindPdf(dataDir + "ExtractText-Page.pdf");
    extractor.ExtractText();

    // Counts the pages written so far; incremented before use, so files start at 1.
    int pagesWritten = 0;
    while (extractor.HasNextPageText())
    {
        pagesWritten++;

        // Fetch the next page's text into memory.
        MemoryStream pageBuffer = new MemoryStream();
        extractor.GetNextPageText(pageBuffer);

        // Rewind and decode using Encoding.Unicode.
        string pageText;
        pageBuffer.Position = 0;
        using (StreamReader reader = new StreamReader(pageBuffer, Encoding.Unicode))
        {
            pageText = reader.ReadToEnd();
        }

        File.WriteAllText(dataDir + "output" + pagesWritten + ".txt", pageText);
    }
}
/// <summary>
/// Extracts the text of "ExtractText-Page.pdf" page by page and writes each page to
/// "output&lt;N&gt;_out.txt" in the facades text data directory.
/// </summary>
public static void Run()
{
    // ExStart:ExtractTextPage
    // Folder containing the sample documents.
    string dataDir = RunExamples.GetDataDir_AsposePdfFacades_Text();
    string inputPath = dataDir + "ExtractText-Page.pdf";

    // Bind the input PDF.
    PdfExtractor extractor = new PdfExtractor();
    extractor.BindPdf(inputPath);

    // Extract with the parameterless overload.
    extractor.ExtractText();

    // One iteration per page that still has text to hand out.
    for (int page = 1; extractor.HasNextPageText(); page++)
    {
        MemoryStream pageBytes = new MemoryStream();
        extractor.GetNextPageText(pageBytes);

        // Decode the page from the start of the buffer using Encoding.Unicode.
        string pageText;
        using (StreamReader pageReader = new StreamReader(pageBytes, Encoding.Unicode))
        {
            pageReader.BaseStream.Seek(0, SeekOrigin.Begin);
            pageText = pageReader.ReadToEnd();
        }

        string outputPath = dataDir + "output" + page + "_out.txt";
        File.WriteAllText(outputPath, pageText);
    }
    // ExEnd:ExtractTextPage
}
/// <summary>
/// Reports on the console whether "FilledForm.pdf" contains text, images, both or neither.
/// </summary>
public static void Run()
{
    // ExStart:PdfContainsTextOrImages
    // Folder containing the sample documents.
    string dataDir = RunExamples.GetDataDir_AsposePdfFacades_TechnicalArticles();

    // In-memory buffer that receives the extracted text.
    MemoryStream textBuffer = new MemoryStream();

    // Create the extractor and bind the input PDF to it.
    PdfExtractor pdfExtractor = new PdfExtractor();
    pdfExtractor.BindPdf(dataDir + "FilledForm.pdf");

    // Extract the text and copy it into the buffer.
    pdfExtractor.ExtractText();
    pdfExtractor.GetText(textBuffer);

    // Any byte in the buffer counts as "the PDF has text".
    bool hasText = textBuffer.Length > 0;

    // Extract the images; a single available image is enough to set the flag.
    pdfExtractor.ExtractImage();
    bool hasImage = pdfExtractor.HasNextImage();

    // Map the two flags onto the message to print.
    string verdict;
    if (hasText)
    {
        verdict = hasImage ? "PDF contains both text and image" : "PDF contains text only";
    }
    else
    {
        verdict = hasImage ? "PDF contains image only" : "PDF contains neither text or nor image";
    }

    Console.WriteLine(verdict);
    // ExEnd:PdfContainsTextOrImages
}
/// <summary>
/// Extracts the text of the PDF at <paramref name="path"/>, runs the RPD content
/// extractor over it and checks that the first detected code equals
/// <paramref name="code"/> and that the document is classified as an RPD.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <param name="plus">Single entry placed in the first list passed to the extractor config.</param>
/// <param name="code">Code expected as the first extracted code.</param>
public void PdfExtractTextRpdTest(string path, string plus, string code)
{
    // Arrange: text extractor plus an RPD extractor configured with the code pattern.
    var textExtractor = new PdfExtractor(new ContentImageExtractor());
    var firstList = new List<string> { plus };
    var secondList = new List<string>();
    var config = new RpdExtractorConfig(firstList, secondList, @"(?<code>\d\d\.0\d\.\d\d)($|\D)");
    var rpdExtractor = new RpdContentExtractor(config);

    // Act: read the file, extract its text, then extract the RPD data from that text.
    byte[] pdfBytes = File.ReadAllBytes(path);
    var extractedText = textExtractor.ExtractText(pdfBytes, ".pdf");
    var result = rpdExtractor.Extract(extractedText.Content);

    // Assert: at least one code was found, the first one matches, and the type is RPD.
    Assert.True(result.Codes.Count > 0);
    Assert.AreEqual(code, result.Codes.First());
    Assert.AreEqual(DocumentType.Rpd, result.DocumentType);
}
/// <summary>
/// Checks whether "FilledForm.pdf" yields any extracted text and any image, and
/// prints which of the four combinations applies.
/// </summary>
public static void Run()
{
    // ExStart:PdfContainsTextOrImages
    // Folder containing the sample documents.
    string dataDir = RunExamples.GetDataDir_AsposePdfFacades_TechnicalArticles();

    // Stream that will hold the text extracted from the document.
    MemoryStream extractedText = new MemoryStream();

    // Create the extractor and bind the input PDF.
    PdfExtractor extractor = new PdfExtractor();
    extractor.BindPdf(dataDir + "FilledForm.pdf");

    // Extract the text and write it into the memory stream.
    extractor.ExtractText();
    extractor.GetText(extractedText);

    // A non-empty stream means some text was extracted.
    bool containsText = extractedText.Length > 0;

    // Extract the images and check whether at least one is available.
    extractor.ExtractImage();
    bool containsImage = extractor.HasNextImage();

    // Report which combination of text and images was found.
    if (containsText && containsImage)
    {
        Console.WriteLine("PDF contains both text and image");
    }
    else if (containsText)
    {
        Console.WriteLine("PDF contains text only");
    }
    else if (containsImage)
    {
        Console.WriteLine("PDF contains image only");
    }
    else
    {
        Console.WriteLine("PDF contains neither text or nor image");
    }
    // ExEnd:PdfContainsTextOrImages
}
/// <summary>
/// Demonstrates the PdfExtractor features on "inFile.pdf": extracts the text of
/// pages 1-10 to a file, saves the first page's text and the first image to
/// tick-stamped files, and extracts the attachments into the data directory.
/// </summary>
public static void Run()
{
    // ExStart:PdfExtractorFeatures
    // Folder containing the sample documents.
    string dataDir = RunExamples.GetDataDir_AsposePdfFacades_TechnicalArticles();

    // Create the extractor.
    PdfExtractor pdfExtractor = new PdfExtractor();

    // Password of the PDF file (empty here).
    pdfExtractor.Password = "";

    // Page range to process.
    pdfExtractor.StartPage = 1;
    pdfExtractor.EndPage = 10;

    // Bind the PDF file to the extractor.
    string inputPath = dataDir + "inFile.pdf";
    pdfExtractor.BindPdf(inputPath);

    // Extract the text and save all of it into one text file.
    pdfExtractor.ExtractText();
    pdfExtractor.GetText(dataDir + "PdfExtractorFeatures_text_out_.txt");

    // Page text can also be saved separately; here only the next available page is written.
    if (pdfExtractor.HasNextPageText())
    {
        string pageTextPath = dataDir + DateTime.Now.Ticks.ToString() + "_out_.txt";
        pdfExtractor.GetNextPageText(pageTextPath);
    }

    // Extract the images and save the next available one as a JPEG.
    pdfExtractor.ExtractImage();
    if (pdfExtractor.HasNextImage())
    {
        string imagePath = dataDir + DateTime.Now.Ticks.ToString() + "_out_.jpg";
        pdfExtractor.GetNextImage(imagePath, System.Drawing.Imaging.ImageFormat.Jpeg);
    }

    // Extract the attachments into the data directory.
    pdfExtractor.ExtractAttachment();
    pdfExtractor.GetAttachment(dataDir);
    // ExEnd:PdfExtractorFeatures
}
/// <summary>
/// Walks through the PdfExtractor features using "inFile.pdf": text of pages 1-10,
/// the next page's text, the next image (as JPEG) and the attachments are all
/// written to the technical-articles data directory.
/// </summary>
public static void Run()
{
    // ExStart:PdfExtractorFeatures
    // Folder containing the sample documents.
    string dataDir = RunExamples.GetDataDir_AsposePdfFacades_TechnicalArticles();

    // Create the extractor with an empty password and a page range of 1 to 10.
    PdfExtractor featureExtractor = new PdfExtractor
    {
        Password = "",
        StartPage = 1,
        EndPage = 10
    };

    // Bind the PDF file to the extractor.
    featureExtractor.BindPdf(dataDir + "inFile.pdf");

    // Extract the text and save it as a single text file.
    featureExtractor.ExtractText();
    string allTextPath = dataDir + "PdfExtractorFeatures_text_out.txt";
    featureExtractor.GetText(allTextPath);

    // Save the text of the next available page into its own tick-stamped file.
    if (featureExtractor.HasNextPageText())
    {
        string ticks = DateTime.Now.Ticks.ToString();
        featureExtractor.GetNextPageText(dataDir + ticks + "_out.txt");
    }

    // Extract the images and save the next available one as a tick-stamped JPEG.
    featureExtractor.ExtractImage();
    if (featureExtractor.HasNextImage())
    {
        string ticks = DateTime.Now.Ticks.ToString();
        featureExtractor.GetNextImage(dataDir + ticks + "_out.jpg", System.Drawing.Imaging.ImageFormat.Jpeg);
    }

    // Extract the attachments into the data directory.
    featureExtractor.ExtractAttachment();
    featureExtractor.GetAttachment(dataDir);
    // ExEnd:PdfExtractorFeatures
}
/// <summary>
/// Writes the text of the current file, page by page, to a text file in the
/// configured target directory. PDFs are used directly; Word, Excel and PowerPoint
/// files are first converted to a PDF document through the matching helper.
/// </summary>
/// <param name="progress">Optional progress callback invoked through <paramref name="dlg"/>; may be null.</param>
/// <param name="dlg">Form used to marshal the progress callback.</param>
/// <param name="fileType">Source file extension including the dot, e.g. ".pdf" or ".docx".</param>
/// <remarks>
/// Failures are swallowed: on any exception the method returns without reporting
/// 100% progress. An unsupported <paramref name="fileType"/> (or a missing document)
/// returns the same way, without creating an output file.
/// </remarks>
private void pdf_to_txt(save_progress progress, System.Windows.Forms.Form dlg, string fileType)
{
    try
    {
        Aspose.Pdf.Document document = null;

        // Progress offset: 0 when the PDF is used directly, 50 when the document came
        // from one of the *_to_pdf helpers (presumably they report the first half
        // themselves — confirm).
        int num = 0;
        switch (fileType)
        {
            case ".pdf":
                document = this.pdf_doc;
                break;
            case ".doc":
            case ".docx":
                document = this.doc_to_pdf(progress, dlg, 0);
                num = 50;
                break;
            case ".xls":
            case ".xlsx":
                document = this.xls_to_pdf(progress, dlg, 0);
                num = 50;
                break;
            case ".ppt":
            case ".pptx":
                document = this.ppt_to_pdf(progress, dlg, 0);
                num = 50;
                break;
        }

        // Nothing to extract: bail out before an empty output file gets created.
        if (document == null)
        {
            return;
        }

        PdfExtractor extractor = new PdfExtractor(document);
        string outputPath = this.global_config.target_dic
            + Path.GetFileNameWithoutExtension(this.file_path)
            + this.get_suffix();

        // The using block closes the file even when extraction throws part-way through.
        using (FileStream outputStream = new FileStream(outputPath, FileMode.Create))
        {
            // NOTE(review): mode 0 is presumably Aspose's "pure text" mode — confirm.
            extractor.ExtractTextMode = 0;

            if (progress != null)
            {
                dlg.Invoke(progress, new object[] { num });
            }

            int pageCount = document.Pages.Count;
            for (int i = 1; i <= pageCount; i++)
            {
                // Extract exactly one page per iteration and append it to the output.
                extractor.StartPage = i;
                extractor.EndPage = i;
                extractor.ExtractText(Encoding.UTF8);
                extractor.GetText(outputStream);

                if (progress != null)
                {
                    // Scale the page position into the range that is still open:
                    // num..100, i.e. 0..100 for PDFs and 50..100 after a conversion.
                    dlg.Invoke(progress, new object[] { num + (i * (100 - num)) / pageCount });
                }
            }
        }
    }
    catch (Exception)
    {
        // Best-effort export: a failure is deliberately silent and skips the final 100% report.
        return;
    }

    if (progress != null)
    {
        dlg.Invoke(progress, new object[] { 100 });
    }
}