/// <summary>
/// Extracts the text found inside a fixed rectangle on the first page of
/// "ExtractTextAll.pdf" and writes it to "extracted-text.txt" in the data directory.
/// </summary>
public static void Run()
{
    // ExStart:ExtractTextFromPageRegion
    // The path to the documents directory.
    string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

    // Open document; the using block releases it once extraction is finished
    using (Document pdfDocument = new Document(dataDir + "ExtractTextAll.pdf"))
    {
        // Create TextAbsorber object to extract text
        TextAbsorber absorber = new TextAbsorber();
        absorber.TextSearchOptions.LimitToPageBounds = true;
        absorber.TextSearchOptions.Rectangle = new Aspose.Pdf.Rectangle(100, 200, 250, 350);

        // Accept the absorber for first page
        pdfDocument.Pages[1].Accept(absorber);

        // Get the extracted text
        string extractedText = absorber.Text;

        // Create a writer and open the file; it is closed even if the write fails
        using (TextWriter tw = new StreamWriter(dataDir + "extracted-text.txt"))
        {
            // Write a line of text to the file
            tw.WriteLine(extractedText);
        }
    }
    // ExEnd:ExtractTextFromPageRegion
}
/// <summary>
/// Loads a small HTML page with the given IgnoreNoscriptElements setting, saves it as PDF,
/// and checks whether the noscript text ended up in the PDF.
/// </summary>
/// <param name="ignoreNoscriptElements">Value assigned to HtmlLoadOptions.IgnoreNoscriptElements.</param>
public void IgnoreNoscriptElements(bool ignoreNoscriptElements)
{
    //ExStart
    //ExFor:HtmlLoadOptions.IgnoreNoscriptElements
    //ExSummary:Shows how to ignore <noscript> HTML elements.
    const string html = @" <html> <head> <title>NOSCRIPT</title> <meta http-equiv=""Content-Type"" content=""text/html; charset=utf-8""> <script type=""text/javascript""> alert(""Hello, world!""); </script> </head> <body> <noscript><p>Your browser does not support JavaScript!</p></noscript> </body> </html>";

    HtmlLoadOptions htmlLoadOptions = new HtmlLoadOptions
    {
        IgnoreNoscriptElements = ignoreNoscriptElements
    };

    byte[] htmlBytes = Encoding.UTF8.GetBytes(html);
    Document doc = new Document(new MemoryStream(htmlBytes), htmlLoadOptions);
    doc.Save(ArtifactsDir + "HtmlLoadOptions.IgnoreNoscriptElements.pdf");
    //ExEnd

    // Read the saved PDF back and compare its full text with the expectation
    Aspose.Pdf.Document savedPdf = new Aspose.Pdf.Document(ArtifactsDir + "HtmlLoadOptions.IgnoreNoscriptElements.pdf");
    TextAbsorber pdfText = new TextAbsorber();
    pdfText.Visit(savedPdf);

    string expectedText;
    if (ignoreNoscriptElements)
    {
        expectedText = "";
    }
    else
    {
        expectedText = "Your browser does not support JavaScript!";
    }

    Assert.AreEqual(expectedText, pdfText.Text);
}
/// <summary>
/// Extracts the text of the first page of "ExtractTextPage.pdf" and writes it to
/// "extracted-text_out.txt" in the data directory.
/// </summary>
public static void Run()
{
    // ExStart:ExtractTextPage
    // The path to the documents directory.
    string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

    string extractedText;

    // Open document; the using block releases it once the text has been read
    using (Document pdfDocument = new Document(dataDir + "ExtractTextPage.pdf"))
    {
        // Create TextAbsorber object to extract text
        TextAbsorber textAbsorber = new TextAbsorber();

        // Accept the absorber for a particular page
        pdfDocument.Pages[1].Accept(textAbsorber);

        // Get the extracted text
        extractedText = textAbsorber.Text;
    }

    // From here on dataDir holds the full path of the output file
    dataDir = dataDir + "extracted-text_out.txt";

    // Create a writer and open the file; it is closed even if the write fails
    using (TextWriter tw = new StreamWriter(dataDir))
    {
        // Write a line of text to the file
        tw.WriteLine(extractedText);
    }
    // ExEnd:ExtractTextPage
    Console.WriteLine("\nText extracted successfully from Pages of PDF Document.\nFile saved at " + dataDir);
}
/// <summary>
/// Extracts the text of every page of "input.pdf" and writes it to
/// "extracted-text.txt", both located in the "../../../Data/" directory.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    // The path to the documents directory.
    string dataDir = Path.GetFullPath("../../../Data/");

    string extractedText;

    // Open document; the using block releases it once the text has been read
    using (Document pdfDocument = new Document(dataDir + "input.pdf"))
    {
        // Create TextAbsorber object to extract text
        TextAbsorber textAbsorber = new TextAbsorber();

        // Accept the absorber for all the pages
        pdfDocument.Pages.Accept(textAbsorber);

        // Get the extracted text
        extractedText = textAbsorber.Text;
    }

    // Create a writer and open the file; it is closed even if the write fails
    using (TextWriter tw = new StreamWriter(dataDir + "extracted-text.txt"))
    {
        // Write a line of text to the file
        tw.WriteLine(extractedText);
    }
}
/// <summary>
/// Extracts the text of every page of "ExtractTextAll.pdf" and writes it to
/// "extracted-text.txt" in the data directory.
/// </summary>
public static void Run()
{
    // The path to the documents directory.
    string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

    string extractedText;

    // Open document; the using block releases it once the text has been read
    using (Document pdfDocument = new Document(dataDir + "ExtractTextAll.pdf"))
    {
        // Create TextAbsorber object to extract text
        TextAbsorber textAbsorber = new TextAbsorber();

        // Accept the absorber for all the pages
        pdfDocument.Pages.Accept(textAbsorber);

        // Get the extracted text
        extractedText = textAbsorber.Text;
    }

    // Create a writer and open the file; it is closed even if the write fails
    using (TextWriter tw = new StreamWriter(dataDir + "extracted-text.txt"))
    {
        // Write a line of text to the file
        tw.WriteLine(extractedText);
    }
}
/// <summary>
/// Collects text from at most the first 20 pages of <paramref name="doc"/> using the
/// "Pure" text formatting mode and returns everything the absorber gathered.
/// </summary>
/// <param name="doc">Document whose pages are visited.</param>
/// <returns>The absorber's text after the visited pages.</returns>
private static string ExtractPdfText(Document doc)
{
    var absorber = new TextAbsorber();
    absorber.ExtractionOptions = new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Pure);

    var pageNumber = 1;
    while (pageNumber <= Math.Min(doc.Pages.Count, 20))
    {
        try
        {
            doc.Pages[pageNumber].Accept(absorber);
        }
        catch (IndexOutOfRangeException)
        {
            // Best effort: ignore this page and move on to the next one
        }
        catch (ArgumentException)
        {
            // Best effort: ignore this page and move on to the next one
        }
        catch (EndOfStreamException)
        {
            // Stop visiting further pages; whatever was gathered so far is returned
            break;
        }

        pageNumber++;
    }

    return absorber.Text;
}
/// <summary>
/// Opens the PDF at <paramref name="path"/>, extracts the text of the page reported by
/// FindPage, and looks up a value for <paramref name="year"/> using the primary marker
/// and, if that fails, the fallback marker.
/// </summary>
/// <param name="path">Path of the PDF file to open.</param>
/// <param name="year">Year forwarded to TryFindValue.</param>
/// <returns>The value produced by TryFindValue, or null when FindPage yields no text fragments.</returns>
public static decimal? FindTotalActiva(string path, int year)
{
    decimal? total = null;

    using (var document = new Document(path))
    {
        var page = FindPage(document);
        if (page.TextFragments.Count <= 0)
        {
            return total;
        }

        // The page that owns the first found fragment is the one that gets scanned
        var activaPage = page.TextFragments[1].Page;
        var pageAbsorber = new TextAbsorber();
        pageAbsorber.Visit(activaPage);

        // NOTE(review): as visible here both Replace arguments look like a plain space;
        // the first one may be a different whitespace character in the file — confirm.
        var rawLines = pageAbsorber.Text.Split('\n');
        var lines = rawLines
            .Select(l => l.Replace(" ", " ").ToLowerInvariant())
            .ToArray();

        var foundWithPrimaryMarker = TryFindValue(lines, TOTAL_ACTIVA_MARKER, year, out total);
        if (!foundWithPrimaryMarker)
        {
            TryFindValue(lines, TOTAL_ACTIVA_MARKER_FALLBACK, year, out total);
        }
    }

    return total;
}
/// <summary>
/// Reports whether the PDF behind <paramref name="doc"/> contains any text other than
/// spaces and line breaks. Pages are visited in order and the check stops at the first hit.
/// </summary>
/// <param name="doc">Source whose Stream is opened as a PDF document.</param>
/// <returns>
/// true when non-blank text is found; false when none is found or when any exception
/// occurs (the exception is logged, not rethrown).
/// </returns>
public static bool HasText(IDoc doc)
{
    try
    {
        var absorber = new TextAbsorber();
        using (var pdf = new Document(doc.Stream))
        {
            foreach (Page currentPage in pdf.Pages)
            {
                currentPage.Accept(absorber);

                var visibleText = absorber.Text.Trim(' ', '\n', '\r');
                if (visibleText.Length == 0)
                {
                    continue;
                }

                return true;
            }
        }
    }
    catch (Exception ex)
    {
        // Fall back to a placeholder when the document has no usable identifier
        string identifier;
        if (string.IsNullOrWhiteSpace(doc.Identifier))
        {
            identifier = "???";
        }
        else
        {
            identifier = doc.Identifier;
        }

        Log.Error(ex, "HasText für {Identifier} failed.", identifier);
    }

    return false;
}
/// <summary>
/// Adds a white redaction annotation to every page of <paramref name="pdfDoc"/> and
/// applies it immediately. The redacted rectangle is (0, 75, page width, 1).
/// </summary>
/// <param name="pdfDoc">Document whose pages are redacted in place.</param>
/// <remarks>
/// Exceptions propagate to the caller unchanged, with their original stack trace.
/// </remarks>
private void RemoveFooter(Aspose.Pdf.Document pdfDoc)
{
    // PDF page numbers start at 1
    for (int i = 1; i <= pdfDoc.Pages.Count; i++)
    {
        Page page = pdfDoc.Pages[i];
        Aspose.Pdf.Rectangle rect = new Aspose.Pdf.Rectangle(0, 75, page.Rect.Width, 1);

        RedactionAnnotation annot = new RedactionAnnotation(page, rect);
        annot.FillColor = Aspose.Pdf.Color.White;
        annot.BorderColor = Aspose.Pdf.Color.Yellow;
        annot.Color = Aspose.Pdf.Color.White;
        annot.TextAlignment = Aspose.Pdf.HorizontalAlignment.Center;

        page.Annotations.Add(annot);
        annot.Redact();

        // NOTE(review): the absorbed text is never read; this looks like leftover work
        // and can probably be removed — confirm it has no required side effect.
        TextAbsorber textAbsorber = new TextAbsorber();
        pdfDoc.Pages[i].Accept(textAbsorber);
    }
}
/// <summary>
/// Shrinks the font size of every text fragment in "ExtractTextPage.pdf" to 70%,
/// re-loads the modified document from memory, extracts its text and writes it to
/// "ExtractColumnsText_out.txt" in the data directory.
/// </summary>
public static void Run()
{
    // ExStart:ExtractColumnsText
    // The path to the documents directory.
    string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

    String extractedText;

    // Open document; the stream and both documents are released when the blocks end
    using (Document pdfDocument = new Document(dataDir + "ExtractTextPage.pdf"))
    using (Stream st = new MemoryStream())
    {
        TextFragmentAbsorber tfa = new TextFragmentAbsorber();
        pdfDocument.Pages.Accept(tfa);
        TextFragmentCollection tfc = tfa.TextFragments;
        foreach (TextFragment tf in tfc)
        {
            // Reduce the font size to 70% of its current value
            tf.TextState.FontSize = tf.TextState.FontSize * 0.7f;
        }

        // Round-trip through memory so the text is extracted from the modified document
        pdfDocument.Save(st);
        using (Document reducedDocument = new Document(st))
        {
            TextAbsorber textAbsorber = new TextAbsorber();
            reducedDocument.Pages.Accept(textAbsorber);
            extractedText = textAbsorber.Text;
        }
    }

    // From here on dataDir holds the full path of the output file
    dataDir = dataDir + "ExtractColumnsText_out.txt";
    System.IO.File.WriteAllText(dataDir, extractedText);
    // ExEnd:ExtractColumnsText
    Console.WriteLine("\nColumns text extracted successfully from Pages of PDF Document.\nFile saved at " + dataDir);
}
/// <summary>
/// Returns the text a TextAbsorber gathers from all pages of <paramref name="pdfDocument"/>.
/// </summary>
/// <param name="pdfDocument">Document whose page collection is visited.</param>
/// <returns>The absorber's Text after visiting the pages.</returns>
private static string GetTextFromPdf(Document pdfDocument)
{
    TextAbsorber absorber = new TextAbsorber();
    pdfDocument.Pages.Accept(absorber);

    string documentText = absorber.Text;
    return documentText;
}
/// <summary>
/// Extracts the text of all pages of the PDF at <paramref name="absoluteFilePath"/> and
/// writes it to <paramref name="outputPath"/>.
/// </summary>
/// <param name="absoluteFilePath">Path of the PDF file to read.</param>
/// <param name="outputPath">Path of the text file to write.</param>
public void ToTxt(string absoluteFilePath, string outputPath)
{
    TextAbsorber absorber = new TextAbsorber();

    using (Aspose.Pdf.Document source = new Aspose.Pdf.Document(absoluteFilePath))
    {
        source.Pages.Accept(absorber);

        // The file is written while the source document is still open
        string plainText = absorber.Text;
        File.WriteAllText(outputPath, plainText);
    }
}
/// <summary>
/// Extracts the text of all pages of <paramref name="pdfDocument"/> and strips every
/// whitespace character from it.
/// </summary>
/// <param name="pdfDocument">Document whose page collection is visited.</param>
/// <returns>The extracted text with all matches of the regex \s removed.</returns>
private static string GetContent(Document pdfDocument)
{
    var absorber = new TextAbsorber();

    // Visit every page of the document
    pdfDocument.Pages.Accept(absorber);

    // Remove all whitespace from the gathered text
    string rawText = absorber.Text;
    string compactText = Regex.Replace(rawText, @"\s", "");
    return compactText;
}
/// <summary>
/// Indexes every page of every PDF found under the text-files folder, adding one
/// Lucene document per page (path, file name, page number and page content), and
/// reports progress after each page.
/// </summary>
/// <param name="analayer">Analyzer handed to the IndexWriter.</param>
public void CreateIndex(Analyzer analayer)
{
    FSDirectory fsDir = new SimpleFSDirectory(new DirectoryInfo(_indexerFolder));
    IndexWriter indexWriter = new IndexWriter(fsDir, analayer, true, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
    Stopwatch stopWatch = Stopwatch.StartNew();

    try
    {
        int analyzedCount = 0;
        string[] files = System.IO.Directory.GetFiles(_textFilesFolder, this._fileSearchPattern, SearchOption.AllDirectories);

        // Count the pages that have to be indexed (used for progress reporting)
        int totalPages = GetTotalPages(files);
        // ElapsedMilliseconds is the whole duration; Elapsed.Milliseconds would only be the 0-999 part
        WriteLog("Total pages statistics takes {0}ms", (int)stopWatch.ElapsedMilliseconds);
        stopWatch.Restart();

        // Start indexing
        foreach (string pdfFile in files)
        {
            var fileInfo = new FileInfo(pdfFile);
            var fileName = fileInfo.Name;

            // Release each PDF as soon as its pages have been indexed
            using (Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(pdfFile))
            {
                WriteLog("Current file is {0}", pdfFile);

                // Note: PDF page numbers start at 1
                for (int i = 1; i <= pdfDocument.Pages.Count; i++)
                {
                    Page page = pdfDocument.Pages[i];

                    // Use a fresh absorber for each page so the content indexed for page i
                    // cannot include text gathered from previously visited pages
                    TextAbsorber textAbsorber = new TextAbsorber();
                    page.Accept(textAbsorber);
                    string pageContent = textAbsorber.Text;

                    Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
                    doc.Add(new Field(LuceneConfig.Field_Path, pdfFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.Add(new Field(LuceneConfig.Field_FileName, fileName, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(LuceneConfig.Field_PageNumber, i.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(LuceneConfig.Field_ContentByPage, pageContent, Field.Store.NO, Field.Index.ANALYZED));
                    indexWriter.AddDocument(doc);

                    analyzedCount++;
                    // Guard against a zero page total so progress reporting cannot divide by zero
                    if (totalPages > 0)
                    {
                        RaiseProgressChanged(analyzedCount * 100 / totalPages);
                    }
                }
            }
        }

        indexWriter.Optimize();
    }
    finally
    {
        // Always release the writer, even when indexing fails part-way
        indexWriter.Dispose();
    }

    stopWatch.Stop();
    Console.WriteLine("All completed. It takes {0}ms", stopWatch.ElapsedMilliseconds);
}
//public List<int> ReadPdfFile(string fileName, String searthText)
//{
//    List<int> pages = new List<int>();
//    if (File.Exists(fileName))
//    {
//        //PdfReader pdfReader = new PdfReader(fileName);
//        for (int page = 1; page <= Doc.PageCount; page++)
//        {
//            //ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
//            string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
//            if (currentPageText.Contains(searthText))
//            {
//                pages.Add(page);
//            }
//        }
//        pdfReader.Close();
//    }
//    return pages;
//}

/// <summary>
/// Extracts the text of all pages of the PDF at ActiveFileName and writes it to
/// "demodata.txt" in the current working directory.
/// </summary>
private void ExtractData1()
{
    // Release the document as soon as the text has been written
    using (var pdfDocument = new Document(ActiveFileName))
    {
        TextAbsorber textAbsorber = new TextAbsorber();
        pdfDocument.Pages.Accept(textAbsorber);

        String extractedText = textAbsorber.Text;
        File.WriteAllText(@"demodata.txt", extractedText);
    }
}
/// <summary>
/// Indexes every page of every PDF found under the text-files folder, adding one
/// Lucene document per page (path, file name, page number and page content), and
/// reports progress after each page.
/// </summary>
/// <param name="analayer">Analyzer handed to the IndexWriter.</param>
public void CreateIndex(Analyzer analayer)
{
    FSDirectory fsDir = new SimpleFSDirectory(new DirectoryInfo(_indexerFolder));
    IndexWriter indexWriter = new IndexWriter(fsDir, analayer, true, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
    Stopwatch stopWatch = Stopwatch.StartNew();

    try
    {
        int analyzedCount = 0;
        string[] files = System.IO.Directory.GetFiles(_textFilesFolder, this._fileSearchPattern, SearchOption.AllDirectories);

        // Count the pages that have to be indexed (used for progress reporting)
        int totalPages = GetTotalPages(files);
        // ElapsedMilliseconds is the whole duration; Elapsed.Milliseconds would only be the 0-999 part
        WriteLog("Total pages statistics takes {0}ms", (int)stopWatch.ElapsedMilliseconds);
        stopWatch.Restart();

        // Start indexing
        foreach (string pdfFile in files)
        {
            var fileInfo = new FileInfo(pdfFile);
            var fileName = fileInfo.Name;

            // Release each PDF as soon as its pages have been indexed
            using (Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(pdfFile))
            {
                WriteLog("Current file is {0}", pdfFile);

                // Note: PDF page numbers start at 1
                for (int i = 1; i <= pdfDocument.Pages.Count; i++)
                {
                    Page page = pdfDocument.Pages[i];

                    // Use a fresh absorber for each page so the content indexed for page i
                    // cannot include text gathered from previously visited pages
                    TextAbsorber textAbsorber = new TextAbsorber();
                    page.Accept(textAbsorber);
                    string pageContent = textAbsorber.Text;

                    Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
                    doc.Add(new Field(LuceneConfig.Field_Path, pdfFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.Add(new Field(LuceneConfig.Field_FileName, fileName, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(LuceneConfig.Field_PageNumber, i.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(LuceneConfig.Field_ContentByPage, pageContent, Field.Store.NO, Field.Index.ANALYZED));
                    indexWriter.AddDocument(doc);

                    analyzedCount++;
                    // Guard against a zero page total so progress reporting cannot divide by zero
                    if (totalPages > 0)
                    {
                        RaiseProgressChanged(analyzedCount * 100 / totalPages);
                    }
                }
            }
        }

        indexWriter.Optimize();
    }
    finally
    {
        // Always release the writer, even when indexing fails part-way
        indexWriter.Dispose();
    }

    stopWatch.Stop();
    Console.WriteLine("All completed. It takes {0}ms", stopWatch.ElapsedMilliseconds);
}
/// <summary>
/// Saves "German text.docx" to PDF once with a "de-CH" hyphenation dictionary registered
/// and once after un-registering it, then checks the line breaks in both PDFs.
/// </summary>
public void Dictionary()
{
    //ExStart
    //ExFor:Hyphenation.IsDictionaryRegistered(String)
    //ExFor:Hyphenation.RegisterDictionary(String, String)
    //ExFor:Hyphenation.UnregisterDictionary(String)
    //ExSummary:Shows how to register a hyphenation dictionary.
    // A hyphenation dictionary contains a list of strings that define hyphenation rules for the dictionary's language.
    // When a document contains lines of text in which a word could be split up and continued on the next line,
    // hyphenation will look through the dictionary's list of strings for that word's substrings.
    // If the dictionary contains a substring, then hyphenation will split the word across two lines
    // by the substring, and add a hyphen to the first half.
    // Register a dictionary file from the local file system to the "de-CH" locale.
    Hyphenation.RegisterDictionary("de-CH", MyDir + "hyph_de_CH.dic");

    Assert.True(Hyphenation.IsDictionaryRegistered("de-CH"));

    // Open a document containing text with a locale matching that of our dictionary,
    // and save it to a fixed-page save format. The text in that document will be hyphenated.
    Document hyphenatedDoc = new Document(MyDir + "German text.docx");
    bool allRunsAreSwissGerman = hyphenatedDoc.FirstSection.Body.FirstParagraph.Runs
        .OfType<Run>()
        .All(r => r.Font.LocaleId == new CultureInfo("de-CH").LCID);
    Assert.True(allRunsAreSwissGerman);

    hyphenatedDoc.Save(ArtifactsDir + "Hyphenation.Dictionary.Registered.pdf");

    // Re-load the document after un-registering the dictionary,
    // and save it to another PDF, which will not have hyphenated text.
    Hyphenation.UnregisterDictionary("de-CH");

    Assert.False(Hyphenation.IsDictionaryRegistered("de-CH"));

    Document plainDoc = new Document(MyDir + "German text.docx");
    plainDoc.Save(ArtifactsDir + "Hyphenation.Dictionary.Unregistered.pdf");
    //ExEnd

#if NET462 || NETCOREAPP2_1 || JAVA
    // Text expected when words are split with hyphens at the line ends
    const string hyphenatedText = "La ob storen an deinen am sachen. Dop-\r\n" +
                                  "pelte um da am spateren verlogen ge-\r\n" +
                                  "kommen achtzehn blaulich.";
    // Text expected when whole words wrap to the next line
    const string unhyphenatedText = "La ob storen an deinen am sachen. \r\n" +
                                    "Doppelte um da am spateren verlogen \r\n" +
                                    "gekommen achtzehn blaulich.";

    Aspose.Pdf.Document registeredPdf = new Aspose.Pdf.Document(ArtifactsDir + "Hyphenation.Dictionary.Registered.pdf");
    TextAbsorber registeredText = new TextAbsorber();
    registeredText.Visit(registeredPdf);
    Assert.True(registeredText.Text.Contains(hyphenatedText));

    Aspose.Pdf.Document unregisteredPdf = new Aspose.Pdf.Document(ArtifactsDir + "Hyphenation.Dictionary.Unregistered.pdf");
    TextAbsorber unregisteredText = new TextAbsorber();
    unregisteredText.Visit(unregisteredPdf);
    Assert.True(unregisteredText.Text.Contains(unhyphenatedText));
#endif
}
/// <summary>
/// Returns the text of all pages of <paramref name="pdfDocument"/> with every
/// whitespace character removed.
/// </summary>
/// <param name="pdfDocument">Document whose page collection is visited.</param>
/// <returns>The extracted text with all matches of the regex \s removed.</returns>
private static string GetContent(Document pdfDocument)
{
    var pageTextCollector = new TextAbsorber();

    // Let the collector visit all the pages
    pdfDocument.Pages.Accept(pageTextCollector);

    // Strip whitespace from whatever was collected
    string collected = pageTextCollector.Text;
    string withoutWhitespace = Regex.Replace(collected, @"\s", "");
    return withoutWhitespace;
}
/// <summary>
/// Prints the text contained in the "N" appearance of annotation 3 on page 1 of
/// "test.pdf", which is expected to be a stamp annotation.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Annotation 3 on page 1 is not a stamp annotation.
/// </exception>
public static void Run()
{
    //ExStart:ExtractTextFromStampAnnotation
    string dataDir = RunExamples.GetDataDir_AsposePdf_StampsWatermarks();

    // Release the document once the text has been printed
    using (Document doc = new Document(dataDir + "test.pdf"))
    {
        StampAnnotation annot = doc.Pages[1].Annotations[3] as StampAnnotation;
        if (annot == null)
        {
            // Fail with a clear message instead of a NullReferenceException further down
            throw new InvalidOperationException("Annotation 3 on page 1 of test.pdf is not a stamp annotation.");
        }

        TextAbsorber ta = new TextAbsorber();
        XForm ap = annot.Appearance["N"];
        ta.Visit(ap);
        Console.WriteLine(ta.Text);
    }
    //ExEnd: ExtractTextFromStampAnnotation
}
/// <summary>
/// Starts a background task that extracts the text of the PDF in <paramref name="stream"/>
/// and hands it to IndexingTools.AddTextExtract for the context's VersionId.
/// The method itself returns an empty string without waiting for that task.
/// </summary>
/// <param name="stream">Stream containing the PDF; it is read by the background task.</param>
/// <param name="context">Supplies the VersionId the extracted text is stored under.</param>
/// <returns>Always an empty string.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    // NOTE(review): the task is not awaited or observed, so failures inside it are not
    // reported to the caller, and the stream is read after this method has returned —
    // confirm the caller keeps the stream open long enough.
    Action extractInBackground = () =>
    {
        AsposePreviewProvider.CheckLicense(AsposePreviewProvider.LicenseProvider.Pdf);

        var pdf = new Aspose.Pdf.Document(stream);
        var absorber = new TextAbsorber();
        pdf.Pages.Accept(absorber);

        IndexingTools.AddTextExtract(context.VersionId, absorber.Text);
    };

    Task.Run(extractInBackground);

    return string.Empty;
}
/// <summary>
/// Extracts the text of a single page by copying it into a temporary document and
/// running a TextAbsorber in "Raw" formatting mode over that document.
/// </summary>
/// <param name="inputDocument">Document that contains the page.</param>
/// <param name="pageIndex">Index used to look the page up in inputDocument.Pages.</param>
/// <returns>The text gathered from the copied page.</returns>
private string ExtrairPagina(Document inputDocument, int pageIndex)
{
    var rawMode = new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Raw);
    var absorber = new TextAbsorber(rawMode);

    using (var singlePageDocument = new Document())
    {
        // The temporary document holds only the requested page
        singlePageDocument.Pages.Add(inputDocument.Pages[pageIndex]);
        singlePageDocument.Pages.Accept(absorber);

        return absorber.Text;
    }
}
/// <summary>
/// Extracts the text of all pages of the PDF in <paramref name="doc"/> and appends it
/// to a new ExtractionResult created with the configured maximum extraction size.
/// </summary>
/// <param name="doc">Source whose Stream is opened as a PDF document.</param>
/// <param name="settings">Supplies MaxExtractionSize for the result.</param>
/// <returns>The result containing the appended text.</returns>
public override ExtractionResult ExtractText(IDoc doc, ITextExtractorSettings settings)
{
    ExtractionResult extraction = new ExtractionResult(settings.MaxExtractionSize);
    TextAbsorber absorber = new TextAbsorber();

    using (Document pdf = new Document(doc.Stream))
    {
        pdf.Pages.Accept(absorber);
    }

    // The gathered text is appended after the document has been released
    extraction.Append(absorber.Text);

    return extraction;
}
/// <summary>
/// Writes the text of every page of the PDF at _path to <paramref name="stream"/>,
/// page by page, using Encoding.Default.
/// </summary>
/// <param name="stream">
/// Destination stream. It is wrapped in a StreamWriter that is disposed before returning.
/// </param>
public void ExportToText(Stream stream)
{
    // Release the document once all pages have been written
    using (Document pdfDocument = new Document(_path))
    using (TextWriter tw = new StreamWriter(stream, Encoding.Default))
    {
        // PDF page numbers start at 1
        for (int i = 1; i <= pdfDocument.Pages.Count; i++)
        {
            Page page = pdfDocument.Pages[i];

            // Use a fresh absorber for each page so the text written for page i
            // cannot include text gathered from previously visited pages
            TextAbsorber pageAbsorber = new TextAbsorber();
            page.Accept(pageAbsorber);
            tw.Write(pageAbsorber.Text);
        }
    }
}
/// <summary>
/// Checks the first page of "Convert/output.pdf" for the Aspose evaluation notice.
/// </summary>
/// <returns>
/// false when the page text contains "Evaluation Only. Created with Aspose.Pdf";
/// true otherwise.
/// </returns>
public static bool CheckLicense()
{
    // Release the document as soon as the first page has been read
    using (Document pdfDocument = new Document(HttpContext.Current.Server.MapPath("Convert/output.pdf")))
    {
        // Create TextAbsorber object to extract text
        TextAbsorber textAbsorber = new TextAbsorber();

        // Accept the absorber for the first page
        pdfDocument.Pages[1].Accept(textAbsorber);

        // Get the extracted text
        String extractedText = textAbsorber.Text;

        return(!extractedText.Contains("Evaluation Only. Created with Aspose.Pdf"));
    }
}
/// <summary>
/// Exports "Convert/Export.pdf" to the requested format inside the "Convert" folder.
/// </summary>
/// <param name="fileType">One of "txt", "pdf", "docx", "svg", "xps", "xls" or "html".</param>
/// <returns>
/// The name of the produced file ("Export.pdf" for "pdf"), or an empty string for an
/// unrecognized <paramref name="fileType"/>.
/// </returns>
public static string btnTextExport_Click(string fileType)
{
    // Release the document on every return path
    using (Document doc = new Document(HttpContext.Current.Server.MapPath("Convert/Export.pdf")))
    {
        switch (fileType)
        {
            case "txt":
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();
                // Accept the absorber for all the pages
                doc.Pages.Accept(textAbsorber);
                // Get the extracted text
                string extractedText = textAbsorber.Text;
                System.IO.File.WriteAllText(HttpContext.Current.Server.MapPath("Convert/output.txt"), extractedText);
                return("output.txt");

            case "pdf":
                // The source file itself is the result; nothing is converted
                return("Export.pdf");

            case "docx":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.docx"), SaveFormat.DocX);
                return("output.docx");

            case "svg":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.svg"), SaveFormat.Svg);
                return("output.svg");

            case "xps":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.xps"), SaveFormat.Xps);
                return("output.xps");

            case "xls":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.xls"), SaveFormat.Excel);
                return("output.xls");

            case "html":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.html"), SaveFormat.Html);
                return("output.html");

            default:
                return("");
        }
    }
}
/// <summary>
/// Extracts the text of all pages of the PDF contained in <paramref name="data"/> and
/// removes each Aspose evaluation notice from it.
/// </summary>
/// <param name="extensionName">Not used by this implementation.</param>
/// <param name="data">Raw bytes of the PDF file.</param>
/// <returns>The extracted text without the evaluation notices.</returns>
protected override string ExtractText(string extensionName, byte[] data)
{
    string extractedText;

    using (MemoryStream memoryStream = new MemoryStream(data))
    using (Document doc = new Document(memoryStream))
    {
        TextAbsorber textAbsorber = new TextAbsorber();
        doc.Pages.Accept(textAbsorber);

        // Treat a missing text as empty so the replacement below never receives null
        extractedText = textAbsorber.Text ?? string.Empty;
    }

    // The quantifier is lazy on purpose: each notice is removed on its own. A greedy match
    // would run from the first notice to the end of the last one and delete the real text
    // in between.
    return(Regex.Replace(extractedText, "Evaluation Only. Created with Aspose[\\S\\s]*? Aspose Pty Ltd.", ""));
}
// ExStart:ShowLinkAnnotations
/// <summary>
/// Prints, for every link annotation on <paramref name="page"/> whose action is a
/// GoToURIAction, the target URI followed by the page text inside the annotation rectangle.
/// </summary>
/// <param name="page">Page whose annotations are inspected.</param>
public static void ShowLinkAnnotations(Page page)
{
    foreach (Aspose.Pdf.Annotations.Annotation annot in page.Annotations)
    {
        LinkAnnotation link = annot as LinkAnnotation;
        if (link == null)
        {
            continue;
        }

        // Links whose action is not a URI action have no URL to show; skip them
        // instead of failing with a NullReferenceException
        GoToURIAction uriAction = link.Action as GoToURIAction;
        if (uriAction == null)
        {
            continue;
        }

        // Print the URL of each Link Annotation
        Console.WriteLine("URI: " + uriAction.URI);

        // Restrict text extraction to the rectangle covered by the annotation
        TextAbsorber absorber = new TextAbsorber();
        absorber.TextSearchOptions.LimitToPageBounds = true;
        absorber.TextSearchOptions.Rectangle = annot.Rect;
        page.Accept(absorber);
        string extractedText = absorber.Text;

        // Print the text associated with hyperlink
        Console.WriteLine(extractedText);
    }
}
/// <summary>
/// Compares two PDF files by the text extracted from them and asserts that the texts are equal.
/// </summary>
/// <param name="testDirectoty">Directory (prefix) of the test file.</param>
/// <param name="productionDirectory">Directory (prefix) of the production file.</param>
/// <param name="fileName">File name appended to both directories.</param>
public static void ComparePDFbyString(string testDirectoty, string productionDirectory, string fileName)
{
    Stream licenseStream = LicenseStr.LicenseStream;
    new License().SetLicense(licenseStream);

    string extractedTextTestDocument;
    // Release each document as soon as its text has been read
    using (Document pdfDocumentTest = new Document(testDirectoty + fileName))
    {
        TextAbsorber textAbsorberTest = new TextAbsorber();
        pdfDocumentTest.Pages.Accept(textAbsorberTest);
        extractedTextTestDocument = textAbsorberTest.Text;
    }

    string extractedTextProductionDocument;
    using (Document pdfDocumentProduction = new Document(productionDirectory + fileName))
    {
        TextAbsorber textAbsorberProduction = new TextAbsorber();
        pdfDocumentProduction.Pages.Accept(textAbsorberProduction);
        extractedTextProductionDocument = textAbsorberProduction.Text;
    }

    Assert.AreEqual(extractedTextTestDocument, extractedTextProductionDocument);
}
/// <summary>
/// Converts "document.pdf" in the given editor folder to the requested format and
/// stores the result next to it.
/// </summary>
/// <param name="fileType">One of "txt", "docx", "svg", "xps", "xls" or "html"; other values produce no file.</param>
/// <param name="folder">Sub-folder of the "Editor" working directory that contains document.pdf.</param>
/// <returns>
/// Ok with a new DocStatusModel on success; InternalServerError when loading or
/// converting the document throws.
/// </returns>
public IHttpActionResult ExportFile(string fileType, string folder)
{
    // NOTE(review): folder is inserted into the path unchecked — confirm callers cannot
    // pass values such as ".." to reach outside the Editor directory.
    var fullPath = string.Format("{0}Editor\\{1}", Config.Configuration.WorkingDirectory.Replace("/", "\\"), folder);
    var pdfDocumentPath = string.Format("{0}\\document.pdf", fullPath);

    try
    {
        // Loading happens inside the try so a missing or broken PDF is reported the same
        // way as a failed conversion; the document is released on every path
        using (var doc = new Document(pdfDocumentPath))
        {
            switch (fileType)
            {
                case "txt":
                    TextAbsorber textAbsorber = new TextAbsorber();
                    doc.Pages.Accept(textAbsorber);
                    string extractedText = textAbsorber.Text;
                    File.WriteAllText(string.Format("{0}\\document.txt", fullPath), extractedText);
                    break;

                case "docx":
                    doc.Save(string.Format("{0}\\document.docx", fullPath), SaveFormat.DocX);
                    break;

                case "svg":
                    doc.Save(string.Format("{0}\\document.svg", fullPath), SaveFormat.Svg);
                    break;

                case "xps":
                    doc.Save(string.Format("{0}\\document.xps", fullPath), SaveFormat.Xps);
                    break;

                case "xls":
                    doc.Save(string.Format("{0}\\document.xlsx", fullPath), SaveFormat.Excel);
                    break;

                case "html":
                    doc.Save(string.Format("{0}\\document.html", fullPath), SaveFormat.Html);
                    break;
            }
        }
    }
    catch (Exception e)
    {
        return(InternalServerError(e));
    }

    return(Ok(new DocStatusModel()));
}
/// <summary>
/// Reports whether the text of the PDF at <paramref name="fileName"/> contains
/// <paramref name="searthText"/>, compared with StringComparison.InvariantCultureIgnoreCase.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <param name="searthText">Text to look for.</param>
/// <returns>true when the file exists and its text contains the search text; otherwise false.</returns>
public Boolean ReadPdfFile(string fileName, String searthText)
{
    if (!System.IO.File.Exists(fileName))
    {
        return(false);
    }

    InjectAsposeLicemse();

    // Release the document as soon as its text has been read
    using (var document = new Aspose.Pdf.Document(fileName))
    {
        TextAbsorber textAbsorber = new TextAbsorber();
        document.Pages.Accept(textAbsorber);
        String extractedText = textAbsorber.Text;

        return(extractedText.Contains(searthText, StringComparison.InvariantCultureIgnoreCase));
    }
}
/// <summary>
/// Extracts the text of "ExtractTextPage.pdf" in "Pure" formatting mode with a scale
/// factor of 0.5 and writes it to "ExtractTextUsingScaleFactor_out.text".
/// </summary>
public static void UsingScaleFactor()
{
    // ExStart:UsingScaleFactor
    // The path to the documents directory.
    string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

    String extractedText;

    // Open document; the using block releases it once the text has been read
    using (Document pdfDocument = new Document(dataDir + "ExtractTextPage.pdf"))
    {
        TextAbsorber textAbsorber = new TextAbsorber();
        textAbsorber.ExtractionOptions = new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Pure);

        // Setting scale factor to 0.5 is enough to split columns in the majority of documents
        // Setting it to zero lets the algorithm choose the scale factor automatically
        textAbsorber.ExtractionOptions.ScaleFactor = 0.5; /* 0; */

        pdfDocument.Pages.Accept(textAbsorber);
        extractedText = textAbsorber.Text;
    }

    System.IO.File.WriteAllText(dataDir + "ExtractTextUsingScaleFactor_out.text", extractedText);
    // ExEnd:UsingScaleFactor
}
// This method gets called by the runtime. Use this method to configure the HTTP request pipeline.
/// <summary>
/// Configures the pipeline so that every request is answered with the raw-mode text of
/// the first page of "hola.pdf".
/// </summary>
/// <param name="app">Application builder the request handler is registered on.</param>
/// <param name="env">Hosting environment; enables the developer exception page in development.</param>
public void Configure(IApplicationBuilder app, IHostingEnvironment env)
{
    if (env.IsDevelopment())
    {
        app.UseDeveloperExceptionPage();
    }

    app.Run(async (context) =>
    {
        String extractedText;

        // The document is opened per request, so release it before writing the response
        using (var pdf = new Aspose.Pdf.Document("hola.pdf"))
        {
            // Create TextAbsorber object to extract text
            TextAbsorber textAbsorber = new TextAbsorber(new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Raw));

            // Accept the absorber for the first page
            pdf.Pages[1].Accept(textAbsorber);
            extractedText = textAbsorber.Text;
        }

        //string greeting = Configuration["My:Greeting"];
        await context.Response.WriteAsync(extractedText);
    });
}
/// <summary>
/// Saves a document containing a drop-down list tag to PDF with the given
/// UpdateSdtContent setting and checks which text the PDF shows for the tag.
/// </summary>
/// <param name="updateSdtContent">Value assigned to PdfSaveOptions.UpdateSdtContent.</param>
public void UpdateSdtContent(bool updateSdtContent)
{
    //ExStart
    //ExFor:SaveOptions.UpdateSdtContent
    //ExSummary:Shows how to update structured document tags while saving a document to PDF.
    Document doc = new Document();

    // Insert a drop-down list structured document tag.
    StructuredDocumentTag tag = new StructuredDocumentTag(doc, SdtType.DropDownList, MarkupLevel.Block);
    foreach (string itemValue in new[] { "Value 1", "Value 2", "Value 3" })
    {
        tag.ListItems.Add(new SdtListItem(itemValue));
    }

    // The drop-down list currently displays "Choose an item" as the default text.
    // Set the "SelectedValue" property to one of the list items to get the tag to
    // display that list item's value instead of the default text.
    tag.ListItems.SelectedValue = tag.ListItems[1];

    doc.FirstSection.Body.AppendChild(tag);

    // Create a "PdfSaveOptions" object to pass to the document's "Save" method
    // to modify how that method saves the document to .PDF.
    // Set the "UpdateSdtContent" property to "false" not to update the structured document tags
    // while saving the document to PDF. They will display their default values as they were at the time of construction.
    // Set the "UpdateSdtContent" property to "true" to make sure the tags display updated values in the PDF.
    PdfSaveOptions options = new PdfSaveOptions
    {
        UpdateSdtContent = updateSdtContent
    };

    doc.Save(ArtifactsDir + "StructuredDocumentTag.UpdateSdtContent.pdf", options);
    //ExEnd

#if NET462 || NETCOREAPP2_1 || JAVA
    Aspose.Pdf.Document savedPdf = new Aspose.Pdf.Document(ArtifactsDir + "StructuredDocumentTag.UpdateSdtContent.pdf");
    TextAbsorber pdfText = new TextAbsorber();
    pdfText.Visit(savedPdf);

    string expectedText;
    if (updateSdtContent)
    {
        expectedText = "Value 2";
    }
    else
    {
        expectedText = "Choose an item.";
    }

    Assert.AreEqual(expectedText, pdfText.Text);
#endif
}
/// <summary>
/// Saves "German text.docx" to PDF with SuppressAutoHyphens set on its first paragraph
/// and checks whether the PDF text is hyphenated.
/// </summary>
/// <param name="suppressAutoHyphens">Value assigned to the first paragraph's SuppressAutoHyphens.</param>
public void SuppressHyphens(bool suppressAutoHyphens)
{
    //ExStart
    //ExFor:ParagraphFormat.SuppressAutoHyphens
    //ExSummary:Shows how to suppress hyphenation for a paragraph.
    Hyphenation.RegisterDictionary("de-CH", MyDir + "hyph_de_CH.dic");

    Assert.True(Hyphenation.IsDictionaryRegistered("de-CH"));

    // Open a document containing text with a locale matching that of our dictionary.
    // When we save this document to a fixed page save format, its text will have hyphenation.
    Document doc = new Document(MyDir + "German text.docx");

    // We can set the "SuppressAutoHyphens" property to "true" to disable hyphenation
    // for a specific paragraph while keeping it enabled for the rest of the document.
    // The default value for this property is "false",
    // which means every paragraph by default uses hyphenation if any is available.
    var firstParagraph = doc.FirstSection.Body.FirstParagraph;
    firstParagraph.ParagraphFormat.SuppressAutoHyphens = suppressAutoHyphens;

    doc.Save(ArtifactsDir + "ParagraphFormat.SuppressHyphens.pdf");
    //ExEnd

#if NET462 || NETCOREAPP2_1 || JAVA
    Aspose.Pdf.Document savedPdf = new Aspose.Pdf.Document(ArtifactsDir + "ParagraphFormat.SuppressHyphens.pdf");
    TextAbsorber pdfText = new TextAbsorber();
    pdfText.Visit(savedPdf);

    // Whole words wrap to the next line when hyphenation is suppressed;
    // otherwise words are split with a hyphen at the line end.
    string expectedText = suppressAutoHyphens
        ? "La ob storen an deinen am sachen. \r\n" +
          "Doppelte um da am spateren verlogen \r\n" +
          "gekommen achtzehn blaulich."
        : "La ob storen an deinen am sachen. Dop-\r\n" +
          "pelte um da am spateren verlogen ge-\r\n" +
          "kommen achtzehn blaulich.";

    Assert.True(pdfText.Text.Contains(expectedText));
#endif
}
/// <summary>
/// Saves a document containing a date tag and a drop-down list tag to PDF with the given
/// UpdateSdtContent setting and checks which text the PDF shows for the tags.
/// </summary>
/// <param name="updateSdtContent">Value assigned to PdfSaveOptions.UpdateSdtContent.</param>
public void UpdateSdtContent(bool updateSdtContent)
{
    //ExStart
    //ExFor:SaveOptions.UpdateSdtContent
    //ExSummary:Shows how structured document tags can be updated while saving to .pdf.
    Document doc = new Document();

    // Insert two StructuredDocumentTags; a date and a drop-down list
    StructuredDocumentTag dateTag = new StructuredDocumentTag(doc, SdtType.Date, MarkupLevel.Block);
    dateTag.FullDate = DateTime.Now;

    doc.FirstSection.Body.AppendChild(dateTag);

    StructuredDocumentTag dropDownTag = new StructuredDocumentTag(doc, SdtType.DropDownList, MarkupLevel.Block);
    foreach (string itemValue in new[] { "Value 1", "Value 2", "Value 3" })
    {
        dropDownTag.ListItems.Add(new SdtListItem(itemValue));
    }
    dropDownTag.ListItems.SelectedValue = dropDownTag.ListItems[1];

    doc.FirstSection.Body.AppendChild(dropDownTag);

    // We've selected default values for both tags
    // We can save those values in the document without immediately updating the tags, leaving them in their default state
    // by using a SaveOptions object with this flag set
    PdfSaveOptions options = new PdfSaveOptions
    {
        UpdateSdtContent = updateSdtContent
    };

    doc.Save(ArtifactsDir + "StructuredDocumentTag.UpdateSdtContent.pdf", options);
    //ExEnd

#if NET462 || NETCOREAPP2_1 || JAVA
    Aspose.Pdf.Document savedPdf = new Aspose.Pdf.Document(ArtifactsDir + "StructuredDocumentTag.UpdateSdtContent.pdf");
    TextAbsorber pdfText = new TextAbsorber();
    pdfText.Visit(savedPdf);

    string expectedText;
    if (updateSdtContent)
    {
        expectedText = "Value 2";
    }
    else
    {
        expectedText = $"Click here to enter a date.{Environment.NewLine}Choose an item.";
    }

    Assert.AreEqual(expectedText, pdfText.Text);
#endif
}
/// <summary>
/// Extracts the text of page 104 of the PDF at Config.TestPdf and writes it to
/// standard output using Encoding.Default.
/// </summary>
static void TestRead()
{
    string extractedText;

    // Release the document as soon as the page text has been read
    using (Document pdfDocument = new Document(Config.TestPdf))
    {
        // Create TextAbsorber object to extract text
        TextAbsorber textAbsorber = new TextAbsorber();

        // Accept the absorber for a particular page
        pdfDocument.Pages[104].Accept(textAbsorber);

        // Get the extracted text
        extractedText = textAbsorber.Text;
    }

    // Create a writer over standard output; it is flushed and closed even if the write fails
    using (TextWriter tw = new StreamWriter(Console.OpenStandardOutput(), Encoding.Default))
    {
        // Write a line of text to the output
        tw.WriteLine(extractedText);
    }
}
/// <summary>
/// Checks the first page of "Convert/output.pdf" for the Aspose evaluation notice.
/// </summary>
/// <returns>
/// false when the page text contains "Evaluation Only. Created with Aspose.Pdf";
/// true otherwise.
/// </returns>
public static bool CheckLicense()
{
    // Release the document as soon as the first page has been read
    using (Document pdfDocument = new Document(HttpContext.Current.Server.MapPath("Convert/output.pdf")))
    {
        // Create TextAbsorber object to extract text
        TextAbsorber textAbsorber = new TextAbsorber();

        // Accept the absorber for the first page
        pdfDocument.Pages[1].Accept(textAbsorber);

        // Get the extracted text
        String extractedText = textAbsorber.Text;

        if (extractedText.Contains("Evaluation Only. Created with Aspose.Pdf"))
        {
            return false;
        }
        else
        {
            return true;
        }
    }
}
/// <summary>
/// Exports "Convert/Export.pdf" to the requested format inside the "Convert" folder.
/// </summary>
/// <param name="fileType">One of "txt", "pdf", "docx", "svg", "xps", "xls" or "html".</param>
/// <returns>
/// The name of the produced file ("Export.pdf" for "pdf"), or an empty string for an
/// unrecognized <paramref name="fileType"/>.
/// </returns>
public static string btnTextExport_Click(string fileType)
{
    // Release the document on every return path
    using (Document doc = new Document(HttpContext.Current.Server.MapPath("Convert/Export.pdf")))
    {
        switch (fileType)
        {
            case "txt":
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();
                // Accept the absorber for all the pages
                doc.Pages.Accept(textAbsorber);
                // Get the extracted text
                string extractedText = textAbsorber.Text;
                System.IO.File.WriteAllText(HttpContext.Current.Server.MapPath("Convert/output.txt"), extractedText);
                return "output.txt";

            case "pdf":
                // The source file itself is the result; nothing is converted
                return "Export.pdf";

            case "docx":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.docx"), SaveFormat.DocX);
                return "output.docx";

            case "svg":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.svg"), SaveFormat.Svg);
                return "output.svg";

            case "xps":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.xps"), SaveFormat.Xps);
                return "output.xps";

            case "xls":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.xls"), SaveFormat.Excel);
                return "output.xls";

            case "html":
                doc.Save(HttpContext.Current.Server.MapPath("Convert/output.html"), SaveFormat.Html);
                return "output.html";

            default:
                return "";
        }
    }
}