/// <summary>
        /// Extracts the text inside a fixed rectangle on the first page of
        /// "ExtractTextAll.pdf" and writes it to "extracted-text.txt" in the data directory.
        /// </summary>
        public static void Run()
        {
            // ExStart:ExtractTextFromPageRegion
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

            // Open document; the using block releases it once extraction is finished
            using (Document pdfDocument = new Document(dataDir + "ExtractTextAll.pdf"))
            {
                // Create TextAbsorber object limited to the rectangle of interest
                TextAbsorber absorber = new TextAbsorber();
                absorber.TextSearchOptions.LimitToPageBounds = true;
                absorber.TextSearchOptions.Rectangle = new Aspose.Pdf.Rectangle(100, 200, 250, 350);

                // Accept the absorber for first page
                pdfDocument.Pages[1].Accept(absorber);

                // Get the extracted text
                string extractedText = absorber.Text;

                // The using block closes the writer even when the write throws
                using (TextWriter tw = new StreamWriter(dataDir + "extracted-text.txt"))
                {
                    // Write a line of text to the file
                    tw.WriteLine(extractedText);
                }
            }
            // ExEnd:ExtractTextFromPageRegion
        }
        /// <summary>
        /// Loads an HTML snippet containing a noscript element with the given
        /// IgnoreNoscriptElements setting, saves it as PDF, and checks the PDF text:
        /// empty when the element is ignored, the noscript message otherwise.
        /// </summary>
        /// <param name="ignoreNoscriptElements">Value assigned to HtmlLoadOptions.IgnoreNoscriptElements.</param>
        public void IgnoreNoscriptElements(bool ignoreNoscriptElements)
        {
            //ExStart
            //ExFor:HtmlLoadOptions.IgnoreNoscriptElements
            //ExSummary:Shows how to ignore <noscript> HTML elements.
            const string html = @"
                <html>
                  <head>
                    <title>NOSCRIPT</title>
                      <meta http-equiv=""Content-Type"" content=""text/html; charset=utf-8"">
                      <script type=""text/javascript"">
                        alert(""Hello, world!"");
                      </script>
                  </head>
                <body>
                  <noscript><p>Your browser does not support JavaScript!</p></noscript>
                </body>
                </html>";

            HtmlLoadOptions htmlLoadOptions = new HtmlLoadOptions { IgnoreNoscriptElements = ignoreNoscriptElements };

            // Feed the markup to the loader as a UTF-8 encoded in-memory stream.
            byte[] htmlBytes = Encoding.UTF8.GetBytes(html);
            Document doc = new Document(new MemoryStream(htmlBytes), htmlLoadOptions);

            doc.Save(ArtifactsDir + "HtmlLoadOptions.IgnoreNoscriptElements.pdf");
            //ExEnd

            // Read the saved PDF back and compare its text with what the option should produce.
            string expectedText = ignoreNoscriptElements ? "" : "Your browser does not support JavaScript!";

            Aspose.Pdf.Document savedPdf = new Aspose.Pdf.Document(ArtifactsDir + "HtmlLoadOptions.IgnoreNoscriptElements.pdf");
            TextAbsorber savedPdfText = new TextAbsorber();
            savedPdfText.Visit(savedPdf);

            Assert.AreEqual(expectedText, savedPdfText.Text);
        }
Пример #3
0
        /// <summary>
        /// Extracts the text of the first page of "ExtractTextPage.pdf" and writes it to
        /// "extracted-text_out.txt" in the data directory.
        /// </summary>
        public static void Run()
        {
            // ExStart:ExtractTextPage
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

            string extractedText;

            // Open document; disposed as soon as the text has been read
            using (Document pdfDocument = new Document(dataDir + "ExtractTextPage.pdf"))
            {
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();

                // Accept the absorber for a particular page
                pdfDocument.Pages[1].Accept(textAbsorber);

                // Get the extracted text
                extractedText = textAbsorber.Text;
            }

            // From here on dataDir holds the output file path (it is reused in the message below)
            dataDir = dataDir + "extracted-text_out.txt";

            // The using block closes the writer even when the write throws
            using (TextWriter tw = new StreamWriter(dataDir))
            {
                // Write a line of text to the file
                tw.WriteLine(extractedText);
            }
            // ExEnd:ExtractTextPage
            Console.WriteLine("\nText extracted successfully from Pages of PDF Document.\nFile saved at " + dataDir);
        }
Пример #4
0
        /// <summary>
        /// Extracts the text of every page of "input.pdf" from the data directory and
        /// writes it to "extracted-text.txt" in the same directory.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        public static void Main(string[] args)
        {
            // The path to the documents directory.
            string dataDir = Path.GetFullPath("../../../Data/");

            string extractedText;

            // Open document; disposed once the text has been read
            using (Document pdfDocument = new Document(dataDir + "input.pdf"))
            {
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();

                // Accept the absorber for all the pages
                pdfDocument.Pages.Accept(textAbsorber);

                // Get the extracted text
                extractedText = textAbsorber.Text;
            }

            // The using block closes the writer even when the write throws
            using (TextWriter tw = new StreamWriter(dataDir + "extracted-text.txt"))
            {
                // Write a line of text to the file
                tw.WriteLine(extractedText);
            }
        }
Пример #5
0
        /// <summary>
        /// Extracts the text of every page of "ExtractTextAll.pdf" and writes it to
        /// "extracted-text.txt" in the data directory.
        /// </summary>
        public static void Run()
        {
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

            string extractedText;

            // Open document; disposed once the text has been read
            using (Document pdfDocument = new Document(dataDir + "ExtractTextAll.pdf"))
            {
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();

                // Accept the absorber for all the pages
                pdfDocument.Pages.Accept(textAbsorber);

                // Get the extracted text
                extractedText = textAbsorber.Text;
            }

            // The using block closes the writer even when the write throws
            using (TextWriter tw = new StreamWriter(dataDir + "extracted-text.txt"))
            {
                // Write a line of text to the file
                tw.WriteLine(extractedText);
            }
        }
Пример #6
0
        /// <summary>
        /// Collects text from at most the first 20 pages of <paramref name="doc"/> using
        /// the "Pure" text formatting mode.
        /// </summary>
        /// <param name="doc">Document whose pages are visited.</param>
        /// <returns>The text gathered by the absorber.</returns>
        private static string ExtractPdfText(Document doc)
        {
            var pureMode = new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Pure);
            var absorber = new TextAbsorber();
            absorber.ExtractionOptions = pureMode;

            var pageNumber = 1;
            while (pageNumber <= Math.Min(doc.Pages.Count, 20))
            {
                try
                {
                    doc.Pages[pageNumber].Accept(absorber);
                }
                catch (IndexOutOfRangeException)
                {
                    // Best effort: skip this page and carry on with the next one.
                }
                catch (ArgumentException)
                {
                    // Best effort: skip this page and carry on with the next one.
                }
                catch (EndOfStreamException)
                {
                    // Stop visiting further pages; whatever was absorbed so far is returned.
                    break;
                }

                pageNumber++;
            }

            return absorber.Text;
        }
        /// <summary>
        /// Extracts the text of every page of "ExtractTextAll.pdf" and writes it to
        /// "extracted-text.txt" in the data directory.
        /// </summary>
        public static void Run()
        {
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

            // Open document; the using block releases it when extraction is finished
            using (Document pdfDocument = new Document(dataDir + "ExtractTextAll.pdf"))
            {
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();

                // Accept the absorber for all the pages
                pdfDocument.Pages.Accept(textAbsorber);

                // Get the extracted text
                string extractedText = textAbsorber.Text;

                // The using block closes the writer even when the write throws
                using (TextWriter tw = new StreamWriter(dataDir + "extracted-text.txt"))
                {
                    // Write a line of text to the file
                    tw.WriteLine(extractedText);
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Opens the PDF at <paramref name="path"/>, locates a page via FindPage, and searches
        /// that page's lower-cased lines for the value following TOTAL_ACTIVA_MARKER, falling
        /// back to TOTAL_ACTIVA_MARKER_FALLBACK when the first marker yields nothing.
        /// </summary>
        /// <param name="path">Path of the PDF file to open.</param>
        /// <param name="year">Year passed through to TryFindValue.</param>
        /// <returns>The value produced by TryFindValue, or null when FindPage found no text fragments.</returns>
        public static decimal? FindTotalActiva(string path, int year)
        {
            using (var document = new Document(path))
            {
                var searchResult = FindPage(document);

                if (searchResult.TextFragments.Count <= 0)
                {
                    return null;
                }

                // Fragment collections are addressed from 1; take the page of the first hit.
                var pageAbsorber = new TextAbsorber();
                pageAbsorber.Visit(searchResult.TextFragments[1].Page);

                // Normalise each line: replace double spaces with one space, then lower-case.
                var normalisedLines = pageAbsorber.Text
                    .Split('\n')
                    .Select(line => line.Replace("  ", " ").ToLowerInvariant())
                    .ToArray();

                decimal? total;
                bool foundWithPrimaryMarker = TryFindValue(normalisedLines, TOTAL_ACTIVA_MARKER, year, out total);

                if (!foundWithPrimaryMarker)
                {
                    TryFindValue(normalisedLines, TOTAL_ACTIVA_MARKER_FALLBACK, year, out total);
                }

                return total;
            }
        }
Пример #9
0
        /// <summary>
        /// Reports whether the PDF behind <paramref name="doc"/> contains any text other than
        /// spaces and line breaks. Pages are visited in order and the check stops at the
        /// first page after which the absorbed text is non-blank.
        /// </summary>
        /// <param name="doc">Document wrapper supplying the PDF stream and an identifier for logging.</param>
        /// <returns>true when text was found; false when none was found or when any exception occurred (it is logged).</returns>
        public static bool HasText(IDoc doc)
        {
            try
            {
                var absorber = new TextAbsorber();
                using (var pdf = new Document(doc.Stream))
                {
                    foreach (Page currentPage in pdf.Pages)
                    {
                        currentPage.Accept(absorber);

                        var visibleText = absorber.Text.Trim(' ', '\n', '\r');
                        if (visibleText.Length > 0)
                        {
                            return true;
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Fall back to a placeholder so the log entry always carries some identifier.
                var identifier = doc.Identifier;
                if (string.IsNullOrWhiteSpace(identifier))
                {
                    identifier = "???";
                }

                Log.Error(e, "HasText für {Identifier} failed.", identifier);
            }

            return false;
        }
Пример #10
0
        /// <summary>
        /// Adds and applies a white redaction annotation on every page of
        /// <paramref name="pdfDoc"/> over the rectangle (0, 75, page width, 1)
        /// (presumably the footer band, judging by the method name).
        /// </summary>
        /// <param name="pdfDoc">Document whose pages are redacted in place.</param>
        private void RemoveFooter(Aspose.Pdf.Document pdfDoc)
        {
            // The former try/catch only did "throw ex;", which resets the stack trace and adds
            // nothing else. Letting exceptions propagate keeps the same exception types for
            // callers and preserves the original trace.
            for (int i = 1; i <= pdfDoc.Pages.Count; i++)
            {
                Page page = pdfDoc.Pages[i];
                Aspose.Pdf.Rectangle rect  = new Aspose.Pdf.Rectangle(0, 75, page.Rect.Width, 1);
                RedactionAnnotation  annot = new RedactionAnnotation(page, rect);
                annot.FillColor   = Aspose.Pdf.Color.White;
                annot.BorderColor = Aspose.Pdf.Color.Yellow;
                annot.Color       = Aspose.Pdf.Color.White;

                annot.TextAlignment = Aspose.Pdf.HorizontalAlignment.Center;
                page.Annotations.Add(annot);
                annot.Redact();

                // NOTE(review): the absorbed text is never read; kept because it is unclear
                // whether visiting the page after redaction is relied upon - confirm before removing.
                TextAbsorber textAbsorber = new TextAbsorber();
                page.Accept(textAbsorber);
            }
        }
        /// <summary>
        /// Scales every text fragment of "ExtractTextPage.pdf" to 70% of its font size,
        /// reloads the modified document from memory, extracts its text and writes it to
        /// "ExtractColumnsText_out.txt" in the data directory.
        /// </summary>
        public static void Run()
        {
            // ExStart:ExtractColumnsText
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

            string extractedText;

            // Open document; the in-memory stream must outlive the document that is reloaded from it
            using (Document pdfDocument = new Document(dataDir + "ExtractTextPage.pdf"))
            using (Stream st = new MemoryStream())
            {
                TextFragmentAbsorber tfa = new TextFragmentAbsorber();
                pdfDocument.Pages.Accept(tfa);
                TextFragmentCollection tfc = tfa.TextFragments;
                foreach (TextFragment tf in tfc)
                {
                    // Scale the font size down to 70% of its original value
                    tf.TextState.FontSize = tf.TextState.FontSize * 0.7f;
                }

                pdfDocument.Save(st);

                using (Document resizedDocument = new Document(st))
                {
                    TextAbsorber textAbsorber = new TextAbsorber();
                    resizedDocument.Pages.Accept(textAbsorber);
                    extractedText = textAbsorber.Text;
                    // The former extra textAbsorber.Visit(document) call ran a second extraction
                    // whose result was never read, so it has been dropped.
                }
            }

            dataDir = dataDir + "ExtractColumnsText_out.txt";

            System.IO.File.WriteAllText(dataDir, extractedText);
            // ExEnd:ExtractColumnsText
            Console.WriteLine("\nColumns text extracted successfully from Pages of PDF Document.\nFile saved at " + dataDir);
        }
Пример #12
0
        /// <summary>
        /// Extracts the text of every page of "input.pdf" from the data directory and
        /// writes it to "extracted-text.txt" in the same directory.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        public static void Main(string[] args)
        {
            // The path to the documents directory.
            string dataDir = Path.GetFullPath("../../../Data/");

            // Open document; the using block releases it when extraction is finished
            using (Document pdfDocument = new Document(dataDir + "input.pdf"))
            {
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();

                // Accept the absorber for all the pages
                pdfDocument.Pages.Accept(textAbsorber);

                // Get the extracted text
                string extractedText = textAbsorber.Text;

                // The using block closes the writer even when the write throws
                using (TextWriter tw = new StreamWriter(dataDir + "extracted-text.txt"))
                {
                    // Write a line of text to the file
                    tw.WriteLine(extractedText);
                }
            }
        }
Пример #13
0
        /// <summary>
        /// Extracts the text inside a fixed rectangle on the first page of
        /// "ExtractTextAll.pdf" and writes it to "extracted-text.txt" in the data directory.
        /// </summary>
        public static void Run()
        {
            // ExStart:ExtractTextFromPageRegion
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

            string extractedText;

            // Open document; disposed once the region text has been read
            using (Document pdfDocument = new Document(dataDir + "ExtractTextAll.pdf"))
            {
                // Create TextAbsorber object limited to the rectangle of interest
                TextAbsorber absorber = new TextAbsorber();

                absorber.TextSearchOptions.LimitToPageBounds = true;
                absorber.TextSearchOptions.Rectangle         = new Aspose.Pdf.Rectangle(100, 200, 250, 350);

                // Accept the absorber for first page
                pdfDocument.Pages[1].Accept(absorber);

                // Get the extracted text
                extractedText = absorber.Text;
            }

            // The using block closes the writer even when the write throws
            using (TextWriter tw = new StreamWriter(dataDir + "extracted-text.txt"))
            {
                // Write a line of text to the file
                tw.WriteLine(extractedText);
            }
            // ExEnd:ExtractTextFromPageRegion
        }
        /// <summary>
        /// Extracts the text of the first page of "ExtractTextPage.pdf" and writes it to
        /// "extracted-text_out.txt" in the data directory.
        /// </summary>
        public static void Run()
        {
            // ExStart:ExtractTextPage
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

            // Output path; also reported in the console message at the end
            string outputPath = dataDir + "extracted-text_out.txt";

            // Open document; the using block releases it when extraction is finished
            using (Document pdfDocument = new Document(dataDir + "ExtractTextPage.pdf"))
            {
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();

                // Accept the absorber for a particular page
                pdfDocument.Pages[1].Accept(textAbsorber);

                // Get the extracted text
                string extractedText = textAbsorber.Text;

                // The using block closes the writer even when the write throws
                using (TextWriter tw = new StreamWriter(outputPath))
                {
                    // Write a line of text to the file
                    tw.WriteLine(extractedText);
                }
            }
            // ExEnd:ExtractTextPage
            Console.WriteLine("\nText extracted successfully from Pages of PDF Document.\nFile saved at " + outputPath);
        }
Пример #15
0
        /// <summary>
        /// Runs a TextAbsorber over all pages of <paramref name="pdfDocument"/>.
        /// </summary>
        /// <param name="pdfDocument">Document to read.</param>
        /// <returns>The text collected by the absorber.</returns>
        private static string GetTextFromPdf(Document pdfDocument)
        {
            TextAbsorber absorber = new TextAbsorber();
            pdfDocument.Pages.Accept(absorber);

            string documentText = absorber.Text;
            return documentText;
        }
Пример #16
0
        /// <summary>
        /// Extracts the text of all pages of a PDF file and writes it to a text file.
        /// </summary>
        /// <param name="absoluteFilePath">Path of the PDF file to open.</param>
        /// <param name="outputPath">Path of the text file to write.</param>
        public void ToTxt(string absoluteFilePath, string outputPath)
        {
            TextAbsorber absorber = new TextAbsorber();

            using (Aspose.Pdf.Document source = new Aspose.Pdf.Document(absoluteFilePath))
            {
                source.Pages.Accept(absorber);

                // The file is written while the source document is still open.
                string documentText = absorber.Text;
                File.WriteAllText(outputPath, documentText);
            }
        }
Пример #17
0
        /// <summary>
        /// Extracts the text of all pages and removes every whitespace character from it.
        /// </summary>
        /// <param name="pdfDocument">Document to read.</param>
        /// <returns>The extracted text with all whitespace removed.</returns>
        private static string GetContent(Document pdfDocument)
        {
            var absorber = new TextAbsorber();

            // Visit every page of the document
            pdfDocument.Pages.Accept(absorber);

            // Strip all whitespace from the collected text
            string rawText = absorber.Text;
            string compactText = Regex.Replace(rawText, @"\s", "");
            return compactText;
        }
Пример #18
0
        /// <summary>
        /// Rebuilds the Lucene index in the indexer folder: every page of every file matching
        /// the search pattern under the text-files folder becomes one Lucene document holding
        /// the file path, file name, page number and page text. Progress is reported as a
        /// percentage of the total page count.
        /// </summary>
        /// <param name="analayer">Analyzer handed to the IndexWriter.</param>
        public void CreateIndex(Analyzer analayer)
        {
            FSDirectory fsDir = new SimpleFSDirectory(new DirectoryInfo(_indexerFolder));

            // using guarantees the writer is disposed even when indexing fails part-way
            using (IndexWriter indexWriter = new IndexWriter(fsDir, analayer, true, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED))
            {
                Stopwatch stopWatch     = Stopwatch.StartNew();
                int       analyzedCount = 0;

                string[] files = System.IO.Directory.GetFiles(_textFilesFolder, this._fileSearchPattern, SearchOption.AllDirectories);

                // Count the pages to index so progress can be reported as a percentage
                int totalPages = GetTotalPages(files);

                // ElapsedMilliseconds is the total; Elapsed.Milliseconds is only the 0-999 component
                WriteLog("Total pages statistics takes {0}ms", stopWatch.ElapsedMilliseconds);

                stopWatch.Restart();

                // Start indexing
                foreach (string pdfFile in files)
                {
                    var fileName = new FileInfo(pdfFile).Name;

                    using (Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(pdfFile))
                    {
                        WriteLog("Current file is {0}", pdfFile);

                        // Note: PDF page numbers start at 1
                        for (int i = 1; i <= pdfDocument.Pages.Count; i++)
                        {
                            // A fresh absorber per page keeps text absorbed from earlier pages
                            // (and earlier files) out of this page's content.
                            TextAbsorber textAbsorber = new TextAbsorber();
                            pdfDocument.Pages[i].Accept(textAbsorber);
                            string pageContent = textAbsorber.Text;

                            Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
                            doc.Add(new Field(LuceneConfig.Field_Path, pdfFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
                            doc.Add(new Field(LuceneConfig.Field_FileName, fileName, Field.Store.YES, Field.Index.ANALYZED));
                            doc.Add(new Field(LuceneConfig.Field_PageNumber, i.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            doc.Add(new Field(LuceneConfig.Field_ContentByPage, pageContent, Field.Store.NO, Field.Index.ANALYZED));

                            indexWriter.AddDocument(doc);

                            analyzedCount++;

                            // Guard against a zero total so progress reporting cannot divide by zero
                            if (totalPages > 0)
                            {
                                RaiseProgressChanged(analyzedCount * 100 / totalPages);
                            }
                        }
                    }
                }

                indexWriter.Optimize();

                stopWatch.Stop();
                Console.WriteLine("All completed. It takes {0}ms", stopWatch.ElapsedMilliseconds);
            }
        }
Пример #19
0
        //public List<int> ReadPdfFile(string fileName, String searthText)
        //{
        //    List<int> pages = new List<int>();
        //    if (File.Exists(fileName))
        //    {
        //        //PdfReader pdfReader = new PdfReader(fileName);
        //        for (int page = 1; page <= Doc.PageCount; page++)
        //        {
        //            //ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();


        //            string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
        //            if (currentPageText.Contains(searthText))
        //            {
        //                pages.Add(page);
        //            }
        //        }
        //        pdfReader.Close();
        //    }
        //    return pages;
        //}

        /// <summary>
        /// Extracts the text of all pages of the active file and writes it to
        /// "demodata.txt" in the current working directory.
        /// </summary>
        private void ExtractData1()
        {
            string extractedText;

            using (var pdfDocument = new Document(ActiveFileName))
            {
                TextAbsorber textAbsorber = new TextAbsorber();

                pdfDocument.Pages.Accept(textAbsorber);
                extractedText = textAbsorber.Text;
                // The former extra textAbsorber.Visit(pdfDocument) call ran a second extraction
                // whose result was never read, so it has been dropped.
            }

            File.WriteAllText(@"demodata.txt", extractedText);
        }
Пример #20
0
        /// <summary>
        /// Rebuilds the Lucene index in the indexer folder: every page of every file matching
        /// the search pattern under the text-files folder becomes one Lucene document holding
        /// the file path, file name, page number and page text. Progress is reported as a
        /// percentage of the total page count.
        /// </summary>
        /// <param name="analayer">Analyzer handed to the IndexWriter.</param>
        public void CreateIndex(Analyzer analayer)
        {
            FSDirectory fsDir = new SimpleFSDirectory(new DirectoryInfo(_indexerFolder));

            // using guarantees the writer is disposed even when indexing fails part-way
            using (IndexWriter indexWriter = new IndexWriter(fsDir, analayer, true, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED))
            {
                Stopwatch stopWatch = Stopwatch.StartNew();
                int analyzedCount = 0;

                string[] files = System.IO.Directory.GetFiles(_textFilesFolder, this._fileSearchPattern, SearchOption.AllDirectories);

                // Count the pages to index so progress can be reported as a percentage
                int totalPages = GetTotalPages(files);

                // ElapsedMilliseconds is the total; Elapsed.Milliseconds is only the 0-999 component
                WriteLog("Total pages statistics takes {0}ms", stopWatch.ElapsedMilliseconds);

                stopWatch.Restart();

                // Start indexing
                foreach (string pdfFile in files)
                {
                    var fileInfo = new FileInfo(pdfFile);
                    var fileName = fileInfo.Name;

                    using (Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(pdfFile))
                    {
                        WriteLog("Current file is {0}", pdfFile);

                        // Note: PDF page numbers start at 1
                        for (int i = 1; i <= pdfDocument.Pages.Count; i++)
                        {
                            Page page = pdfDocument.Pages[i];

                            // A fresh absorber per page keeps text absorbed from earlier pages
                            // (and earlier files) out of this page's content.
                            TextAbsorber pageAbsorber = new TextAbsorber();
                            page.Accept(pageAbsorber);
                            string pageContent = pageAbsorber.Text;

                            Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
                            doc.Add(new Field(LuceneConfig.Field_Path, pdfFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
                            doc.Add(new Field(LuceneConfig.Field_FileName, fileName, Field.Store.YES, Field.Index.ANALYZED));
                            doc.Add(new Field(LuceneConfig.Field_PageNumber, i.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            doc.Add(new Field(LuceneConfig.Field_ContentByPage, pageContent, Field.Store.NO, Field.Index.ANALYZED));

                            indexWriter.AddDocument(doc);

                            analyzedCount++;

                            // Guard against a zero total so progress reporting cannot divide by zero
                            if (totalPages > 0)
                            {
                                RaiseProgressChanged(analyzedCount * 100 / totalPages);
                            }
                        }
                    }
                }

                indexWriter.Optimize();

                stopWatch.Stop();
                Console.WriteLine("All completed. It takes {0}ms", stopWatch.ElapsedMilliseconds);
            }
        }
Пример #21
0
        // Registers a "de-CH" hyphenation dictionary, saves a German document to PDF, then
        // unregisters the dictionary and saves the document again. Under the listed build
        // symbols the two PDFs are read back and checked for hyphenated and non-hyphenated text.
        public void Dictionary()
        {
            //ExStart
            //ExFor:Hyphenation.IsDictionaryRegistered(String)
            //ExFor:Hyphenation.RegisterDictionary(String, String)
            //ExFor:Hyphenation.UnregisterDictionary(String)
            //ExSummary:Shows how to register a hyphenation dictionary.
            // A hyphenation dictionary contains a list of strings that define hyphenation rules for the dictionary's language.
            // When a document contains lines of text in which a word could be split up and continued on the next line,
            // hyphenation will look through the dictionary's list of strings for that word's substrings.
            // If the dictionary contains a substring, then hyphenation will split the word across two lines
            // by the substring, and add a hyphen to the first half.
            // Register a dictionary file from the local file system to the "de-CH" locale.
            Hyphenation.RegisterDictionary("de-CH", MyDir + "hyph_de_CH.dic");

            Assert.True(Hyphenation.IsDictionaryRegistered("de-CH"));

            // Open a document containing text with a locale matching that of our dictionary,
            // and save it to a fixed-page save format. The text in that document will be hyphenated.
            Document doc = new Document(MyDir + "German text.docx");

            // Sanity check: every run in the first paragraph uses the "de-CH" locale.
            Assert.True(doc.FirstSection.Body.FirstParagraph.Runs.OfType <Run>().All(
                            r => r.Font.LocaleId == new CultureInfo("de-CH").LCID));

            doc.Save(ArtifactsDir + "Hyphenation.Dictionary.Registered.pdf");

            // Re-load the document after un-registering the dictionary,
            // and save it to another PDF, which will not have hyphenated text.
            Hyphenation.UnregisterDictionary("de-CH");

            Assert.False(Hyphenation.IsDictionaryRegistered("de-CH"));

            doc = new Document(MyDir + "German text.docx");
            doc.Save(ArtifactsDir + "Hyphenation.Dictionary.Unregistered.pdf");
            //ExEnd

            // Verification below is compiled only for the listed build symbols: it reads both
            // PDFs back with Aspose.Pdf and checks where the line breaks fall.
#if NET462 || NETCOREAPP2_1 || JAVA
            Aspose.Pdf.Document pdfDoc       = new Aspose.Pdf.Document(ArtifactsDir + "Hyphenation.Dictionary.Registered.pdf");
            TextAbsorber        textAbsorber = new TextAbsorber();
            textAbsorber.Visit(pdfDoc);

            // With the dictionary registered, words are split with a trailing hyphen ("Dop-", "ge-").
            Assert.True(textAbsorber.Text.Contains("La ob storen an deinen am sachen. Dop-\r\n" +
                                                   "pelte  um  da  am  spateren  verlogen  ge-\r\n" +
                                                   "kommen  achtzehn  blaulich."));

            pdfDoc       = new Aspose.Pdf.Document(ArtifactsDir + "Hyphenation.Dictionary.Unregistered.pdf");
            textAbsorber = new TextAbsorber();
            textAbsorber.Visit(pdfDoc);

            // Without the dictionary, the same words stay whole and wrap to the next line.
            Assert.True(textAbsorber.Text.Contains("La  ob  storen  an  deinen  am  sachen. \r\n" +
                                                   "Doppelte  um  da  am  spateren  verlogen \r\n" +
                                                   "gekommen  achtzehn  blaulich."));
#endif
        }
Пример #22
0
        /// <summary>
        /// Extracts the text of all pages and removes every whitespace character from it.
        /// </summary>
        /// <param name="pdfDocument">Document to read.</param>
        /// <returns>The extracted text with all whitespace removed.</returns>
        private static string GetContent(Document pdfDocument)
        {
            var absorber = new TextAbsorber();

            // Visit every page of the document
            pdfDocument.Pages.Accept(absorber);

            // Collapse the collected text by deleting all whitespace
            string pageText = absorber.Text;
            return Regex.Replace(pageText, @"\s", "");
        }
        /// <summary>
        /// Prints the text contained in the normal ("N") appearance of the annotation at
        /// index 3 on the first page of "test.pdf", provided it is a stamp annotation.
        /// </summary>
        public static void Run()
        {
            //ExStart:ExtractTextFromStampAnnotation
            string dataDir = RunExamples.GetDataDir_AsposePdf_StampsWatermarks();

            using (Document doc = new Document(dataDir + "test.pdf"))
            {
                // "as" yields null when the annotation is of another type; check before
                // dereferencing instead of failing with a NullReferenceException.
                StampAnnotation annot = doc.Pages[1].Annotations[3] as StampAnnotation;
                if (annot == null)
                {
                    Console.WriteLine("Annotation 3 on page 1 is not a stamp annotation.");
                    return;
                }

                TextAbsorber ta = new TextAbsorber();
                XForm        ap = annot.Appearance["N"];

                ta.Visit(ap);
                Console.WriteLine(ta.Text);
            }
            //ExEnd: ExtractTextFromStampAnnotation
        }
Пример #24
0
        /// <summary>
        /// Queues PDF text extraction on a background task and returns immediately.
        /// The task checks the Aspose PDF license, absorbs the text of all pages and hands it
        /// to IndexingTools.AddTextExtract for the context's version id.
        /// </summary>
        /// <param name="stream">Stream containing the PDF.</param>
        /// <param name="context">Supplies the version id the extracted text is stored under.</param>
        /// <returns>Always an empty string; the real text is delivered by the background task.</returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            // NOTE(review): the task is neither awaited nor observed, so the stream has to stay
            // readable after this method returns and failures inside the task are not surfaced
            // here - confirm this is intended.
            Task.Run(() =>
            {
                AsposePreviewProvider.CheckLicense(AsposePreviewProvider.LicenseProvider.Pdf);

                var pdf      = new Aspose.Pdf.Document(stream);
                var absorber = new TextAbsorber();
                pdf.Pages.Accept(absorber);

                var extractedText = absorber.Text;
                IndexingTools.AddTextExtract(context.VersionId, extractedText);
            });

            return string.Empty;
        }
        /// <summary>
        /// Extracts the raw-mode text of one page by copying it into a temporary
        /// single-page document and absorbing that document.
        /// </summary>
        /// <param name="inputDocument">Document that owns the page.</param>
        /// <param name="pageIndex">Index of the page within inputDocument.Pages.</param>
        /// <returns>The text absorbed from the copied page.</returns>
        private string ExtrairPagina(Document inputDocument, int pageIndex)
        {
            var rawMode = new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Raw);
            var absorber = new TextAbsorber(rawMode);

            using (Document singlePageDocument = new Document())
            {
                singlePageDocument.Pages.Add(inputDocument.Pages[pageIndex]);
                singlePageDocument.Pages.Accept(absorber);

                // Read the text before the temporary document is disposed.
                return absorber.Text;
            }
        }
Пример #26
0
        /// <summary>
        /// Extracts the text of all pages of the PDF in <paramref name="doc"/> and appends it
        /// to a new ExtractionResult created with the configured maximum extraction size.
        /// </summary>
        /// <param name="doc">Supplies the PDF stream.</param>
        /// <param name="settings">Supplies MaxExtractionSize for the result.</param>
        /// <returns>The result holding the appended text.</returns>
        public override ExtractionResult ExtractText(IDoc doc, ITextExtractorSettings settings)
        {
            var extraction = new ExtractionResult(settings.MaxExtractionSize);
            var absorber   = new TextAbsorber();

            using (var pdf = new Document(doc.Stream))
            {
                pdf.Pages.Accept(absorber);
            }

            // The absorbed text is appended after the document has been disposed.
            var documentText = absorber.Text;
            extraction.Append(documentText);

            return extraction;
        }
Пример #27
0
        /// <summary>
        /// Writes the text of every page of the PDF at _path to <paramref name="stream"/>,
        /// page by page, using the system default encoding.
        /// </summary>
        /// <param name="stream">Destination stream; it is closed when the writer is disposed.</param>
        public void ExportToText(Stream stream)
        {
            using (Document pdfDocument = new Document(_path))
            using (TextWriter tw = new StreamWriter(stream, Encoding.Default))
            {
                for (int i = 1; i <= pdfDocument.Pages.Count; i++)
                {
                    // A fresh absorber per page: a shared one keeps the text of earlier pages,
                    // so each iteration would write those pages again.
                    TextAbsorber pageAbsorber = new TextAbsorber();
                    pdfDocument.Pages[i].Accept(pageAbsorber);
                    tw.Write(pageAbsorber.Text);
                }
            }
        }
        /// <summary>
        /// Checks the first page of "Convert/output.pdf" for the Aspose evaluation notice.
        /// </summary>
        /// <returns>true when the notice text is absent from the first page; false when it is present.</returns>
        public static bool CheckLicense()
        {
            string pdfPath = HttpContext.Current.Server.MapPath("Convert/output.pdf");
            Document pdfDocument = new Document(pdfPath);

            // Only the first page is inspected
            TextAbsorber firstPageAbsorber = new TextAbsorber();
            pdfDocument.Pages[1].Accept(firstPageAbsorber);

            String firstPageText = firstPageAbsorber.Text;
            bool hasEvaluationNotice = firstPageText.Contains("Evaluation Only. Created with Aspose.Pdf");

            return !hasEvaluationNotice;
        }
Пример #29
0
        /// <summary>
        /// Writes the text of every page of the PDF at _path to <paramref name="stream"/>,
        /// page by page, using the system default encoding.
        /// </summary>
        /// <param name="stream">Destination stream; it is closed when the writer is disposed.</param>
        public void ExportToText(Stream stream)
        {
            using (Document   pdfDocument = new Document(_path))
            using (TextWriter tw          = new StreamWriter(stream, Encoding.Default))
            {
                for (int i = 1; i <= pdfDocument.Pages.Count; i++)
                {
                    Page page = pdfDocument.Pages[i];

                    // A fresh absorber per page: a shared one keeps the text of earlier pages,
                    // so each iteration would write those pages again.
                    TextAbsorber textAbsorber = new TextAbsorber();
                    page.Accept(textAbsorber);
                    tw.Write(textAbsorber.Text);
                }
            }
        }
Пример #30
0
        /// <summary>
        /// Exports "Convert/Export.pdf" to the requested format inside the "Convert" folder.
        /// </summary>
        /// <param name="fileType">One of "txt", "pdf", "docx", "svg", "xps", "xls" or "html".</param>
        /// <returns>
        /// The name of the produced file ("Export.pdf" for "pdf", where nothing is converted),
        /// or an empty string for any other value.
        /// </returns>
        public static string btnTextExport_Click(string fileType)
        {
            var server = HttpContext.Current.Server;
            Document doc = new Document(server.MapPath("Convert/Export.pdf"));

            if (fileType == "txt")
            {
                // Plain text: absorb all pages and dump the result to a file
                TextAbsorber absorber = new TextAbsorber();
                doc.Pages.Accept(absorber);

                string plainText = absorber.Text;
                System.IO.File.WriteAllText(server.MapPath("Convert/output.txt"), plainText);
                return "output.txt";
            }

            if (fileType == "pdf")
            {
                // The source already is a PDF; nothing to convert
                return "Export.pdf";
            }

            if (fileType == "docx")
            {
                doc.Save(server.MapPath("Convert/output.docx"), SaveFormat.DocX);
                return "output.docx";
            }

            if (fileType == "svg")
            {
                doc.Save(server.MapPath("Convert/output.svg"), SaveFormat.Svg);
                return "output.svg";
            }

            if (fileType == "xps")
            {
                doc.Save(server.MapPath("Convert/output.xps"), SaveFormat.Xps);
                return "output.xps";
            }

            if (fileType == "xls")
            {
                doc.Save(server.MapPath("Convert/output.xls"), SaveFormat.Excel);
                return "output.xls";
            }

            if (fileType == "html")
            {
                doc.Save(server.MapPath("Convert/output.html"), SaveFormat.Html);
                return "output.html";
            }

            // Unknown (or null) file type
            return "";
        }
Пример #31
0
        /// <summary>
        /// Extracts the text of all pages of a PDF held in memory and removes the Aspose
        /// evaluation notice matched by the regular expression below.
        /// </summary>
        /// <param name="extensionName">Not used by this implementation.</param>
        /// <param name="data">Raw bytes of the PDF.</param>
        /// <returns>The extracted text with the notice removed.</returns>
        protected override string ExtractText(string extensionName, byte[] data)
        {
            StringBuilder collected = new StringBuilder();

            using (MemoryStream pdfBytes = new MemoryStream(data))
            using (Document pdf = new Document(pdfBytes))
            {
                TextAbsorber absorber = new TextAbsorber();
                pdf.Pages.Accept(absorber);

                collected.Append(absorber.Text);
            }

            string rawText = collected.ToString();
            return Regex.Replace(rawText, "Evaluation Only. Created with Aspose[\\S|\\s]* Aspose Pty Ltd.", "");
        }
Пример #32
0
 // ExStart:ShowLinkAnnotations
 /// <summary>
 /// Prints, for every link annotation on <paramref name="page"/>, its target URI (when
 /// the link carries a URI action) followed by the page text inside the annotation rectangle.
 /// </summary>
 /// <param name="page">Page whose annotations are inspected.</param>
 public static void ShowLinkAnnotations(Page page)
 {
     foreach (Aspose.Pdf.Annotations.Annotation annot in page.Annotations)
     {
         LinkAnnotation link = annot as LinkAnnotation;
         if (link == null)
         {
             continue;
         }

         // A link's action is not necessarily a URI action; the unchecked cast used to
         // throw a NullReferenceException for any other kind of action.
         GoToURIAction uriAction = link.Action as GoToURIAction;
         if (uriAction != null)
         {
             // Print the URL of each Link Annotation
             Console.WriteLine("URI: " + uriAction.URI);
         }

         // Limit extraction to the rectangle covered by the annotation
         TextAbsorber absorber = new TextAbsorber();
         absorber.TextSearchOptions.LimitToPageBounds = true;
         absorber.TextSearchOptions.Rectangle         = annot.Rect;
         page.Accept(absorber);
         string extractedText = absorber.Text;
         // Print the text associated with hyperlink
         Console.WriteLine(extractedText);
     }
 }
Пример #33
0
        /// <summary>
        /// Compares two PDF files by the text extracted from each of them.
        /// </summary>
        /// <param name="testDirectoty">Directory prefix of the test document; concatenated directly with <paramref name="fileName"/>.</param>
        /// <param name="productionDirectory">Directory prefix of the production document; concatenated directly with <paramref name="fileName"/>.</param>
        /// <param name="fileName">Name of the PDF file present in both directories.</param>
        public static void ComparePDFbyString(string testDirectoty, string productionDirectory, string fileName)
        {
            Stream licenseStream = LicenseStr.LicenseStream;

            new License().SetLicense(licenseStream);

            // Each document is disposed as soon as its text has been read so the
            // underlying files are released even when the assertion below fails.
            string extractedTextTestDocument;
            using (Document pdfDocumentTest = new Document(testDirectoty + fileName))
            {
                TextAbsorber textAbsorberTest = new TextAbsorber();

                pdfDocumentTest.Pages.Accept(textAbsorberTest);
                extractedTextTestDocument = textAbsorberTest.Text;
            }

            string extractedTextProductionDocument;
            using (Document pdfDocumentProduction = new Document(productionDirectory + fileName))
            {
                TextAbsorber textAbsorberProduction = new TextAbsorber();

                pdfDocumentProduction.Pages.Accept(textAbsorberProduction);
                extractedTextProductionDocument = textAbsorberProduction.Text;
            }

            Assert.AreEqual(extractedTextTestDocument, extractedTextProductionDocument);
        }
Пример #34
0
        /// <summary>
        /// Exports the editor's "document.pdf" from the given folder to the requested
        /// format, writing the result next to the source PDF.
        /// </summary>
        /// <param name="fileType">One of "txt", "docx", "svg", "xps", "xls" or "html". Any other value writes nothing and still yields an OK result.</param>
        /// <param name="folder">Sub-folder of the "Editor" working directory that holds document.pdf.</param>
        /// <returns>An OK result carrying a new <c>DocStatusModel</c>, or an internal-server-error result wrapping the exception raised while loading or exporting.</returns>
        public IHttpActionResult ExportFile(string fileType, string folder)
        {
            // NOTE(review): "folder" is concatenated into the path without validation;
            // confirm that callers cannot pass values such as "..\\" to escape the Editor directory.
            var fullPath        = string.Format("{0}Editor\\{1}", Config.Configuration.WorkingDirectory.Replace("/", "\\"), folder);
            var pdfDocumentPath = string.Format("{0}\\document.pdf", fullPath);

            try
            {
                // The document is opened inside the try block so that a load failure is
                // reported like an export failure, and it is disposed on every path.
                using (var doc = new Document(pdfDocumentPath))
                {
                    switch (fileType)
                    {
                    case "txt":
                        TextAbsorber textAbsorber = new TextAbsorber();
                        doc.Pages.Accept(textAbsorber);
                        string extractedText = textAbsorber.Text;
                        File.WriteAllText(string.Format("{0}\\document.txt", fullPath), extractedText);
                        break;

                    case "docx":
                        doc.Save(string.Format("{0}\\document.docx", fullPath), SaveFormat.DocX);
                        break;

                    case "svg":
                        doc.Save(string.Format("{0}\\document.svg", fullPath), SaveFormat.Svg);
                        break;

                    case "xps":
                        doc.Save(string.Format("{0}\\document.xps", fullPath), SaveFormat.Xps);
                        break;

                    case "xls":
                        // The "xls" request is written to a file named document.xlsx.
                        doc.Save(string.Format("{0}\\document.xlsx", fullPath), SaveFormat.Excel);
                        break;

                    case "html":
                        doc.Save(string.Format("{0}\\document.html", fullPath), SaveFormat.Html);
                        break;
                    }
                }
            }
            catch (Exception e)
            {
                return(InternalServerError(e));
            }
            return(Ok(new DocStatusModel()));
        }
Пример #35
0
        /// <summary>
        /// Checks whether the text of a PDF file contains the given search text,
        /// ignoring case (invariant culture).
        /// </summary>
        /// <param name="fileName">Path of the PDF file to inspect.</param>
        /// <param name="searthText">Text to look for.</param>
        /// <returns><c>true</c> when the file exists and its extracted text contains <paramref name="searthText"/>; otherwise <c>false</c>.</returns>
        public Boolean ReadPdfFile(string fileName, String searthText)
        {
            if (!System.IO.File.Exists(fileName))
            {
                return(false);
            }

            InjectAsposeLicemse();

            using (var document = new Aspose.Pdf.Document(fileName))
            {
                // A single pass over the pages is enough; the former extra
                // Visit(document) call repeated the extraction and discarded the result.
                TextAbsorber textAbsorber = new TextAbsorber();
                document.Pages.Accept(textAbsorber);

                return(textAbsorber.Text.Contains(searthText, StringComparison.InvariantCultureIgnoreCase));
            }
        }
        /// <summary>
        /// Extracts the text of "ExtractTextPage.pdf" in pure formatting mode with a
        /// scale factor of 0.5 and writes it to "ExtractTextUsingScaleFactor_out.text"
        /// in the sample data directory.
        /// </summary>
        public static void UsingScaleFactor()
        {
            // ExStart:UsingScaleFactor
            // Folder that holds the sample documents; the output is written there as well.
            string sampleFolder = RunExamples.GetDataDir_AsposePdf_Text();
            string outputPath   = sampleFolder + "ExtractTextUsingScaleFactor_out.text";

            // Load the source PDF
            var sourcePdf = new Document(sampleFolder + "ExtractTextPage.pdf");

            var absorber = new TextAbsorber();
            absorber.ExtractionOptions = new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Pure);
            // A scale factor of 0.5 is enough to split columns in most documents;
            // a value of zero lets the algorithm pick the scale factor automatically.
            absorber.ExtractionOptions.ScaleFactor = 0.5; /* 0; */
            sourcePdf.Pages.Accept(absorber);

            System.IO.File.WriteAllText(outputPath, absorber.Text);
            // ExEnd:UsingScaleFactor
        }
Пример #37
0
        // This method gets called by the runtime. Use this method to configure the HTTP request pipeline.
        // Every request is answered with the raw-mode text of the first page of "hola.pdf".
        public void Configure(IApplicationBuilder app, IHostingEnvironment env)
        {
            if (env.IsDevelopment())
            {
                app.UseDeveloperExceptionPage();
            }

            app.Run(async(context) =>
            {
                String extractedText;

                // The document is opened per request, so dispose it as soon as the
                // text has been read instead of leaving it to the garbage collector.
                using (var pdf = new Aspose.Pdf.Document("hola.pdf"))
                {
                    // Create TextAbsorber object to extract text
                    TextAbsorber textAbsorber = new TextAbsorber(new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Raw));
                    // Accept the absorber for the first page only
                    pdf.Pages[1].Accept(textAbsorber);
                    extractedText = textAbsorber.Text;
                }

                await context.Response.WriteAsync(extractedText);
            });
        }
        // ExStart:ShowLinkAnnotations
        /// <summary>
        /// Writes to the console, for every link annotation on the page, its target URI
        /// (when the link carries a URI action) followed by the text absorbed from the
        /// annotation's rectangle.
        /// </summary>
        /// <param name="page">Page whose annotations are inspected.</param>
        public static void ShowLinkAnnotations(Page page)
        {
            foreach (Aspose.Pdf.Annotations.Annotation annot in page.Annotations)
            {
                LinkAnnotation link = annot as LinkAnnotation;
                if (link == null)
                {
                    continue;
                }

                // A link may carry a non-URI action (or none at all); dereferencing the
                // failed cast directly would throw NullReferenceException.
                GoToURIAction uriAction = link.Action as GoToURIAction;
                if (uriAction != null)
                {
                    // Print the URL of the Link Annotation
                    Console.WriteLine("URI: " + uriAction.URI);
                }

                // Absorb the page text using the annotation rectangle as the search region
                TextAbsorber absorber = new TextAbsorber();
                absorber.TextSearchOptions.LimitToPageBounds = true;
                absorber.TextSearchOptions.Rectangle = annot.Rect;
                page.Accept(absorber);

                // Print the text associated with hyperlink
                Console.WriteLine(absorber.Text);
            }
        }
        /// <summary>
        /// Saves a document holding a drop-down structured document tag to PDF with the
        /// given "UpdateSdtContent" setting and, on the supported targets, checks the
        /// text that ends up in the PDF.
        /// </summary>
        /// <param name="updateSdtContent">Value assigned to <c>PdfSaveOptions.UpdateSdtContent</c>.</param>
        public void UpdateSdtContent(bool updateSdtContent)
        {
            //ExStart
            //ExFor:SaveOptions.UpdateSdtContent
            //ExSummary:Shows how to update structured document tags while saving a document to PDF.
            var document = new Document();

            // Build a block-level drop-down list tag with three entries.
            var dropDown = new StructuredDocumentTag(document, SdtType.DropDownList, MarkupLevel.Block);

            foreach (string itemText in new[] { "Value 1", "Value 2", "Value 3" })
            {
                dropDown.ListItems.Add(new SdtListItem(itemText));
            }

            // Until a value is selected the list shows its default "Choose an item" text.
            // Selecting the second entry makes the tag display that entry's value instead.
            dropDown.ListItems.SelectedValue = dropDown.ListItems[1];

            document.FirstSection.Body.AppendChild(dropDown);

            // "UpdateSdtContent" set to "false": tags are not updated while saving and keep
            // showing the default values they had at construction time.
            // "UpdateSdtContent" set to "true": tags show their updated values in the PDF.
            var options = new PdfSaveOptions { UpdateSdtContent = updateSdtContent };

            string pdfPath = ArtifactsDir + "StructuredDocumentTag.UpdateSdtContent.pdf";
            document.Save(pdfPath, options);
            //ExEnd

#if NET462 || NETCOREAPP2_1 || JAVA
            var savedPdf = new Aspose.Pdf.Document(pdfPath);
            var absorber = new TextAbsorber();
            absorber.Visit(savedPdf);

            string expectedText;
            if (updateSdtContent)
            {
                expectedText = "Value 2";
            }
            else
            {
                expectedText = "Choose an item.";
            }

            Assert.AreEqual(expectedText, absorber.Text);
#endif
        }
Пример #40
0
        /// <summary>
        /// Saves a German document to PDF with hyphenation suppressed (or not) for its
        /// first paragraph and, on the supported targets, checks the resulting line breaks.
        /// </summary>
        /// <param name="suppressAutoHyphens">Value assigned to the first paragraph's <c>SuppressAutoHyphens</c> property.</param>
        public void SuppressHyphens(bool suppressAutoHyphens)
        {
            //ExStart
            //ExFor:ParagraphFormat.SuppressAutoHyphens
            //ExSummary:Shows how to suppress hyphenation for a paragraph.
            const string locale = "de-CH";

            Hyphenation.RegisterDictionary(locale, MyDir + "hyph_de_CH.dic");

            Assert.True(Hyphenation.IsDictionaryRegistered(locale));

            // The document's text uses the locale of the dictionary registered above,
            // so saving it to a fixed-page format hyphenates the text.
            Document germanDoc = new Document(MyDir + "German text.docx");

            // "SuppressAutoHyphens" defaults to "false", i.e. every paragraph is hyphenated
            // when a dictionary is available. Setting it to "true" turns hyphenation off
            // for this one paragraph while the rest of the document stays hyphenated.
            ParagraphFormat firstParagraphFormat = germanDoc.FirstSection.Body.FirstParagraph.ParagraphFormat;
            firstParagraphFormat.SuppressAutoHyphens = suppressAutoHyphens;

            germanDoc.Save(ArtifactsDir + "ParagraphFormat.SuppressHyphens.pdf");
            //ExEnd

#if NET462 || NETCOREAPP2_1 || JAVA
            Aspose.Pdf.Document savedPdf = new Aspose.Pdf.Document(ArtifactsDir + "ParagraphFormat.SuppressHyphens.pdf");
            TextAbsorber        absorber = new TextAbsorber();
            absorber.Visit(savedPdf);

            // Without hyphenation whole words wrap; with it, words are split with a trailing "-".
            string expectedFragment = suppressAutoHyphens
                ? "La  ob  storen  an  deinen  am  sachen. \r\n" +
                  "Doppelte  um  da  am  spateren  verlogen \r\n" +
                  "gekommen  achtzehn  blaulich."
                : "La ob storen an deinen am sachen. Dop-\r\n" +
                  "pelte  um  da  am  spateren  verlogen  ge-\r\n" +
                  "kommen  achtzehn  blaulich.";

            Assert.True(absorber.Text.Contains(expectedFragment));
#endif
        }
Пример #41
0
        /// <summary>
        /// Saves a document holding a date tag and a drop-down tag to PDF with the given
        /// "UpdateSdtContent" setting and, on the supported targets, checks the text that
        /// ends up in the PDF.
        /// </summary>
        /// <param name="updateSdtContent">Value assigned to <c>PdfSaveOptions.UpdateSdtContent</c>.</param>
        public void UpdateSdtContent(bool updateSdtContent)
        {
            //ExStart
            //ExFor:SaveOptions.UpdateSdtContent
            //ExSummary:Shows how structured document tags can be updated while saving to .pdf.
            var document = new Document();

            // First tag: a date, set to the current time
            var dateTag = new StructuredDocumentTag(document, SdtType.Date, MarkupLevel.Block);
            dateTag.FullDate = DateTime.Now;
            document.FirstSection.Body.AppendChild(dateTag);

            // Second tag: a drop-down list with its second entry selected
            var dropDownTag = new StructuredDocumentTag(document, SdtType.DropDownList, MarkupLevel.Block);
            foreach (string itemText in new[] { "Value 1", "Value 2", "Value 3" })
            {
                dropDownTag.ListItems.Add(new SdtListItem(itemText));
            }
            dropDownTag.ListItems.SelectedValue = dropDownTag.ListItems[1];
            document.FirstSection.Body.AppendChild(dropDownTag);

            // Both tags now have values chosen. The flag on the save options decides whether
            // the saved output shows those values or leaves the tags in their default state.
            var options = new PdfSaveOptions { UpdateSdtContent = updateSdtContent };

            string pdfPath = ArtifactsDir + "StructuredDocumentTag.UpdateSdtContent.pdf";
            document.Save(pdfPath, options);
            //ExEnd

#if NET462 || NETCOREAPP2_1 || JAVA
            var savedPdf = new Aspose.Pdf.Document(pdfPath);
            var absorber = new TextAbsorber();
            absorber.Visit(savedPdf);

            string expectedText;
            if (updateSdtContent)
            {
                expectedText = "Value 2";
            }
            else
            {
                expectedText = $"Click here to enter a date.{Environment.NewLine}Choose an item.";
            }

            Assert.AreEqual(expectedText, absorber.Text);
#endif
        }
Пример #42
0
        /// <summary>
        /// Extracts the text of page 104 of the PDF at <c>Config.TestPdf</c> and writes
        /// it to standard output using the default encoding.
        /// </summary>
        static void TestRead()
        {
            string extractedText;

            // Dispose the document once its text has been read
            using (Document pdfDocument = new Document(Config.TestPdf))
            {
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();

                // Accept the absorber for a particular page
                pdfDocument.Pages[104].Accept(textAbsorber);

                // Get the extracted text
                extractedText = textAbsorber.Text;
            }

            // The writer wraps standard output (not a file); the using block flushes
            // and closes it even if the write throws.
            using (TextWriter tw = new StreamWriter(Console.OpenStandardOutput(), Encoding.Default))
            {
                tw.WriteLine(extractedText);
            }
        }
Пример #43
0
        /// <summary>
        /// Reports whether "Convert/output.pdf" was produced without the Aspose.Pdf
        /// evaluation notice, judging by the text of its first page.
        /// </summary>
        /// <returns><c>false</c> when the first page contains the evaluation notice; otherwise <c>true</c>.</returns>
        public static bool CheckLicense()
        {
            using (Document pdfDocument = new Document(HttpContext.Current.Server.MapPath("Convert/output.pdf")))
            {
                // Create TextAbsorber object to extract text
                TextAbsorber textAbsorber = new TextAbsorber();

                // Accept the absorber for the first page only
                pdfDocument.Pages[1].Accept(textAbsorber);

                // Get the extracted text
                String extractedText = textAbsorber.Text;

                // Compared case-insensitively so that a change in the product name's casing
                // does not hide the notice.
                // NOTE(review): exact notice wording differs between library versions - confirm.
                return extractedText.IndexOf("Evaluation Only. Created with Aspose.Pdf", StringComparison.OrdinalIgnoreCase) < 0;
            }
        }
Пример #44
0
        /// <summary>
        /// Exports "Convert/Export.pdf" to the requested format inside the "Convert"
        /// folder and returns the name of the resulting file.
        /// </summary>
        /// <param name="fileType">One of "txt", "pdf", "docx", "svg", "xps", "xls" or "html".</param>
        /// <returns>The output file name ("Export.pdf" for "pdf"), or an empty string for an unrecognised <paramref name="fileType"/>.</returns>
        public static string btnTextExport_Click(string fileType)
        {
            var server = HttpContext.Current.Server;

            // Dispose the document on every return path so Export.pdf is released
            // once the export has finished.
            using (Document doc = new Document(server.MapPath("Convert/Export.pdf")))
            {
                switch (fileType)
                {
                    case "txt":
                        // Create TextAbsorber object to extract text
                        TextAbsorber textAbsorber = new TextAbsorber();

                        // Accept the absorber for all the pages
                        doc.Pages.Accept(textAbsorber);

                        // Get the extracted text
                        string extractedText = textAbsorber.Text;

                        System.IO.File.WriteAllText(server.MapPath("Convert/output.txt"), extractedText);
                        return "output.txt";
                    case "pdf":
                        // The source document already is the requested format
                        return "Export.pdf";
                    case "docx":
                        doc.Save(server.MapPath("Convert/output.docx"), SaveFormat.DocX);
                        return "output.docx";
                    case "svg":
                        doc.Save(server.MapPath("Convert/output.svg"), SaveFormat.Svg);
                        return "output.svg";
                    case "xps":
                        doc.Save(server.MapPath("Convert/output.xps"), SaveFormat.Xps);
                        return "output.xps";
                    case "xls":
                        doc.Save(server.MapPath("Convert/output.xls"), SaveFormat.Excel);
                        return "output.xls";
                    case "html":
                        doc.Save(server.MapPath("Convert/output.html"), SaveFormat.Html);
                        return "output.html";
                    default:
                        return "";
                }
            }
        }