Example #1
0
        /// <summary>
        /// Prints every whitespace-delimited word found on page 1 of "SearchTextRegex.pdf",
        /// using a .NET <c>Regex</c> as the search pattern for the text absorber.
        /// </summary>
        public static void Run()
        {
            //ExStart: SearchTextWithDotNetRegex
            // Full path of the sample PDF inside the text-examples data directory
            string pdfPath = RunExamples.GetDataDir_AsposePdf_Text() + "SearchTextRegex.pdf";

            // Pattern matching each run of non-whitespace characters, i.e. every word
            System.Text.RegularExpressions.Regex wordPattern = new System.Text.RegularExpressions.Regex(@"[\S]+");

            // Load the document and take the page at index 1
            Aspose.Pdf.Document pdf = new Aspose.Pdf.Document(pdfPath);
            Page targetPage = pdf.Pages[1];

            // Absorber that collects every match of the pattern
            TextFragmentAbsorber wordAbsorber = new TextFragmentAbsorber(wordPattern);
            wordAbsorber.TextSearchOptions.IsRegularExpressionUsed = true;

            // Run the search against the selected page only
            targetPage.Accept(wordAbsorber);

            // Emit each matched fragment on its own line
            foreach (TextFragment match in wordAbsorber.TextFragments)
            {
                Console.WriteLine(match.Text);
            }
            //ExEnd: SearchTextWithDotNetRegex
        }
Example #2
0
        /// <summary>
        /// Builds a Lucene index over the files under <c>_textFilesFolder</c> that match
        /// <c>_fileSearchPattern</c>, adding one Lucene document per PDF page and reporting
        /// progress as a percentage of the total page count.
        /// </summary>
        /// <param name="analayer">Analyzer passed to the <c>IndexWriter</c> that writes the index.</param>
        public void CreateIndex(Analyzer analayer)
        {
            Stopwatch stopWatch;

            // Dispose the directory and the writer even when indexing fails part-way, so the
            // index is not left open by an abandoned writer.
            using (FSDirectory fsDir = new SimpleFSDirectory(new DirectoryInfo(_indexerFolder)))
            using (IndexWriter indexWriter = new IndexWriter(fsDir, analayer, true, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED))
            {
                stopWatch = Stopwatch.StartNew();
                int analyzedCount = 0;

                string[] files = System.IO.Directory.GetFiles(_textFilesFolder, this._fileSearchPattern, SearchOption.AllDirectories);

                // Count the pages that need indexing up front so progress can be a percentage
                int totalPages = GetTotalPages(files);

                // ElapsedMilliseconds is the total duration; Elapsed.Milliseconds would only be
                // the 0-999 millisecond component of it.
                WriteLog("Total pages statistics takes {0}ms", stopWatch.ElapsedMilliseconds);

                stopWatch.Restart();

                // Start indexing
                foreach (string pdfFile in files)
                {
                    string fileName = new FileInfo(pdfFile).Name;

                    WriteLog("Current file is {0}", pdfFile);

                    using (Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(pdfFile))
                    {
                        // Note: PDF page numbers start at 1
                        for (int i = 1; i <= pdfDocument.Pages.Count; i++)
                        {
                            // A fresh absorber per page keeps the indexed content limited to this
                            // page; a shared absorber would also carry text absorbed from earlier
                            // pages and files.
                            TextAbsorber textAbsorber = new TextAbsorber();
                            Page page = pdfDocument.Pages[i];
                            page.Accept(textAbsorber);
                            string pageContent = textAbsorber.Text;

                            Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
                            doc.Add(new Field(LuceneConfig.Field_Path, pdfFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
                            doc.Add(new Field(LuceneConfig.Field_FileName, fileName, Field.Store.YES, Field.Index.ANALYZED));
                            doc.Add(new Field(LuceneConfig.Field_PageNumber, i.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            doc.Add(new Field(LuceneConfig.Field_ContentByPage, pageContent, Field.Store.NO, Field.Index.ANALYZED));

                            indexWriter.AddDocument(doc);

                            analyzedCount++;

                            // Guard the division in case the page total was reported as zero
                            if (totalPages > 0)
                            {
                                RaiseProgressChanged(analyzedCount * 100 / totalPages);
                            }
                        }
                    }
                }

                indexWriter.Optimize();
            }

            // Stopped after the writer is disposed so the final flush is included in the timing
            stopWatch.Stop();
            Console.WriteLine("All completed. It takes {0}ms", stopWatch.ElapsedMilliseconds);
        }
        /// <summary>
        /// Adds a red strike-out annotation over every occurrence of the text "Estoque" in
        /// "input.pdf" and saves the result as "StrikeOutWords_out.pdf".
        /// </summary>
        public static void Run()
        {
            // ExStart:StrikeOutWords
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_Annotations();

            // Open document
            Document document = new Document(dataDir + "input.pdf");

            // Create TextFragment Absorber instance to search particular text fragment
            Aspose.Pdf.Text.TextFragmentAbsorber textFragmentAbsorber = new Aspose.Pdf.Text.TextFragmentAbsorber("Estoque");

            // Search all pages of the document. The previous loop iterated over the page
            // count but always accepted Pages[1], so only the first page was ever searched.
            document.Pages.Accept(textFragmentAbsorber);

            // Create a collection of Absorbed text
            Aspose.Pdf.Text.TextFragmentCollection textFragmentCollection = textFragmentAbsorber.TextFragments;

            // Iterate on above collection (indexed from 1)
            for (int j = 1; j <= textFragmentCollection.Count; j++)
            {
                Aspose.Pdf.Text.TextFragment textFragment = textFragmentCollection[j];

                // Get rectangular dimensions of TextFragment object:
                // lower-left corner at the fragment position, extended by its width and height
                Aspose.Pdf.Rectangle rect = new Aspose.Pdf.Rectangle(
                    (float)textFragment.Position.XIndent,
                    (float)textFragment.Position.YIndent,
                    (float)textFragment.Position.XIndent +
                    (float)textFragment.Rectangle.Width,
                    (float)textFragment.Position.YIndent +
                    (float)textFragment.Rectangle.Height);

                // Instantiate StrikeOut Annotation instance on the page that holds the fragment
                StrikeOutAnnotation strikeOut = new StrikeOutAnnotation(textFragment.Page, rect);
                // Set opacity for annotation
                strikeOut.Opacity = .80f;
                // Set the border for annotation instance
                strikeOut.Border = new Border(strikeOut);
                // Set the color of annotation
                strikeOut.Color = Aspose.Pdf.Color.Red;
                // Add annotation to annotations collection of the fragment's page
                textFragment.Page.Annotations.Add(strikeOut);
            }
            dataDir = dataDir + "StrikeOutWords_out.pdf";
            document.Save(dataDir);
            // ExEnd:StrikeOutWords
            Console.WriteLine("\nWords strikeout successfully.\nFile saved at " + dataDir);
        }
Example #4
0
        /// <summary>
        /// Writes the text of every page of the PDF at <c>_path</c> to <paramref name="stream"/>,
        /// page by page, using the system default encoding.
        /// </summary>
        /// <param name="stream">
        /// Destination stream. It is closed when this method returns, because the
        /// <c>StreamWriter</c> wrapping it is disposed.
        /// </param>
        public void ExportToText(Stream stream)
        {
            using (Document pdfDocument = new Document(_path))
            using (TextWriter tw = new StreamWriter(stream, Encoding.Default))
            {
                // Page indexing starts at 1
                for (int i = 1; i <= pdfDocument.Pages.Count; i++)
                {
                    // A fresh absorber per page: a shared absorber keeps the text of the pages
                    // it already visited, so writing its Text on every iteration would repeat
                    // the earlier pages in the output.
                    TextAbsorber textAbsorber = new TextAbsorber();
                    Page page = pdfDocument.Pages[i];
                    page.Accept(textAbsorber);
                    tw.Write(textAbsorber.Text);
                }
            }
        }
Example #5
0
 // ExStart:ShowLinkAnnotations
 /// <summary>
 /// For every link annotation on <paramref name="page"/>, prints its target URI (when the
 /// link carries a URI action) followed by the page text inside the annotation's rectangle.
 /// </summary>
 /// <param name="page">Page whose annotations are inspected.</param>
 public static void ShowLinkAnnotations(Page page)
 {
     foreach (Aspose.Pdf.Annotations.Annotation annot in page.Annotations)
     {
         LinkAnnotation link = annot as LinkAnnotation;
         if (link == null)
         {
             // Not a link annotation; nothing to report
             continue;
         }

         // The link's action may be absent or of a type other than GoToURIAction;
         // dereferencing the failed cast would throw NullReferenceException.
         GoToURIAction uriAction = link.Action as GoToURIAction;
         if (uriAction != null)
         {
             // Print the URL of each Link Annotation
             Console.WriteLine("URI: " + uriAction.URI);
         }

         // Restrict text extraction to the rectangle covered by the annotation
         TextAbsorber absorber = new TextAbsorber();
         absorber.TextSearchOptions.LimitToPageBounds = true;
         absorber.TextSearchOptions.Rectangle         = annot.Rect;
         page.Accept(absorber);
         string extractedText = absorber.Text;
         // Print the text associated with hyperlink
         Console.WriteLine(extractedText);
     }
 }
        // ExStart:ShowLinkAnnotations
        /// <summary>
        /// For every link annotation on <paramref name="page"/>, prints its target URI (when
        /// the link carries a URI action) followed by the page text inside the annotation's
        /// rectangle.
        /// </summary>
        /// <param name="page">Page whose annotations are inspected.</param>
        public static void ShowLinkAnnotations(Page page)
        {
            foreach (Aspose.Pdf.Annotations.Annotation annot in page.Annotations)
            {
                LinkAnnotation link = annot as LinkAnnotation;
                if (link == null)
                {
                    // Not a link annotation; nothing to report
                    continue;
                }

                // The link's action may be absent or of a type other than GoToURIAction;
                // dereferencing the failed cast would throw NullReferenceException.
                GoToURIAction uriAction = link.Action as GoToURIAction;
                if (uriAction != null)
                {
                    // Print the URL of each Link Annotation
                    Console.WriteLine("URI: " + uriAction.URI);
                }

                // Restrict text extraction to the rectangle covered by the annotation
                TextAbsorber absorber = new TextAbsorber();
                absorber.TextSearchOptions.LimitToPageBounds = true;
                absorber.TextSearchOptions.Rectangle = annot.Rect;
                page.Accept(absorber);
                string extractedText = absorber.Text;
                // Print the text associated with hyperlink
                Console.WriteLine(extractedText);
            }
        }
        /// <summary>
        /// Opens "ExtractLinks.pdf", runs an <c>AnnotationSelector</c> built from a
        /// <c>LinkAnnotation</c> over the page at index 1, picks the first selected annotation
        /// (if any) and saves the document as "ExtractLinks_out.pdf".
        /// </summary>
        public static void Run()
        {
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_LinksActions();


            //Open document
            Document document = new Document(dataDir + "ExtractLinks.pdf");

            //Extract actions
            Page page = document.Pages[1];
            AnnotationSelector selector = new AnnotationSelector(new LinkAnnotation(page, Aspose.Pdf.Rectangle.Trivial));

            page.Accept(selector);
            IList list = selector.Selected;

            // The selector may have selected nothing; indexing an empty list would throw
            // ArgumentOutOfRangeException, so fall back to null in that case.
            Annotation annotation = list.Count > 0 ? (Annotation)list[0] : null;

            //Save updated document
            document.Save(dataDir + "ExtractLinks_out.pdf");
        }
Example #8
0
        /// <summary>
        /// Opens "input.pdf" from the relative data folder, runs an <c>AnnotationSelector</c>
        /// built from a <c>LinkAnnotation</c> over the page at index 1, picks the first selected
        /// annotation (if any) and saves the document as "output.pdf".
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            // The path to the documents directory.
            string dataDir = Path.GetFullPath("../../../Data/");


            //Open document
            Document document = new Document(dataDir + "input.pdf");

            //Extract actions
            Page page = document.Pages[1];
            AnnotationSelector selector = new AnnotationSelector(new LinkAnnotation(page, Aspose.Pdf.Rectangle.Trivial));

            page.Accept(selector);
            IList list = selector.Selected;

            // The selector may have selected nothing; indexing an empty list would throw
            // ArgumentOutOfRangeException, so fall back to null in that case.
            Annotation annotation = list.Count > 0 ? (Annotation)list[0] : null;

            //Save updated document
            document.Save(dataDir + "output.pdf");
        }
Example #9
0
        /// <summary>
        /// Opens "ExtractLinks.pdf", runs an <c>AnnotationSelector</c> built from a
        /// <c>LinkAnnotation</c> over the page at index 1, picks the first selected annotation
        /// (if any), saves the document as "ExtractLinks_out.pdf" and reports the output path.
        /// </summary>
        public static void Run()
        {
            // ExStart:ExtractLinks
            // The path to the documents directory.
            string dataDir = RunExamples.GetDataDir_AsposePdf_LinksActions();
            // Open document
            Document document = new Document(dataDir + "ExtractLinks.pdf");
            // Extract actions
            Page page = document.Pages[1];
            AnnotationSelector selector = new AnnotationSelector(new LinkAnnotation(page, Aspose.Pdf.Rectangle.Trivial));

            page.Accept(selector);
            IList <Annotation> list = selector.Selected;

            // The selector may have selected nothing; indexing an empty list would throw
            // ArgumentOutOfRangeException, so fall back to null in that case.
            Annotation annotation = list.Count > 0 ? list[0] : null;

            dataDir = dataDir + "ExtractLinks_out.pdf";
            // Save updated document
            document.Save(dataDir);
            // ExEnd:ExtractLinks
            Console.WriteLine("\nLinks extracted successfully.\nFile saved at " + dataDir);
        }
        /// <summary>
        /// Renders one page image of "input.pdf" to a PNG and outlines each word (yellow),
        /// each text segment (green) and each character (black) found on the page at index 1,
        /// then saves the image as "HighlightCharacterInPDF_out.png".
        /// Any exception is caught and reported on the console together with a licensing hint.
        /// </summary>
        public static void Run()
        {
            try
            {
                // ExStart:HighlightCharacterInPDF
                // The path to the documents directory.
                string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

                int resolution = 150;

                Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(dataDir + "input.pdf");

                using (MemoryStream ms = new MemoryStream())
                {
                    PdfConverter conv = new PdfConverter(pdfDocument);
                    conv.Resolution = new Resolution(resolution, resolution);
                    conv.GetNextImage(ms, System.Drawing.Imaging.ImageFormat.Png);

                    // The bitmap is disposed inside the stream's using block: an image created
                    // with FromStream needs its source stream to stay open while it is in use.
                    using (Bitmap bmp = (Bitmap)Bitmap.FromStream(ms))
                    {
                        using (System.Drawing.Graphics gr = System.Drawing.Graphics.FromImage(bmp))
                        {
                            // Scale page coordinates to image pixels (72f is presumably PDF points
                            // per inch) and flip the Y axis so the origin is at the bottom-left.
                            float scale = resolution / 72f;
                            gr.Transform = new System.Drawing.Drawing2D.Matrix(scale, 0, 0, -scale, 0, bmp.Height);

                            // Only one page image was produced above, and only the page at index 1
                            // was ever drawn, so it is searched once instead of once per page.
                            if (pdfDocument.Pages.Count > 0)
                            {
                                Page page = pdfDocument.Pages[1];
                                // Create TextAbsorber object to find all words
                                TextFragmentAbsorber textFragmentAbsorber = new TextFragmentAbsorber(@"[\S]+");
                                textFragmentAbsorber.TextSearchOptions.IsRegularExpressionUsed = true;
                                page.Accept(textFragmentAbsorber);
                                // Get the extracted text fragments
                                TextFragmentCollection textFragmentCollection = textFragmentAbsorber.TextFragments;
                                // Loop through the fragments
                                foreach (TextFragment textFragment in textFragmentCollection)
                                {
                                    // Outline the whole word
                                    gr.DrawRectangle(
                                        Pens.Yellow,
                                        (float)textFragment.Position.XIndent,
                                        (float)textFragment.Position.YIndent,
                                        (float)textFragment.Rectangle.Width,
                                        (float)textFragment.Rectangle.Height);

                                    // Segments and characters are indexed from 1
                                    for (int segNum = 1; segNum <= textFragment.Segments.Count; segNum++)
                                    {
                                        TextSegment segment = textFragment.Segments[segNum];

                                        for (int charNum = 1; charNum <= segment.Characters.Count; charNum++)
                                        {
                                            CharInfo characterInfo = segment.Characters[charNum];

                                            Aspose.Pdf.Rectangle rect = page.GetPageRect(true);
                                            Console.WriteLine("TextFragment = " + textFragment.Text + "    Page URY = " + rect.URY +
                                                              "   TextFragment URY = " + textFragment.Rectangle.URY);

                                            // Outline the individual character
                                            gr.DrawRectangle(
                                                Pens.Black,
                                                (float)characterInfo.Rectangle.LLX,
                                                (float)characterInfo.Rectangle.LLY,
                                                (float)characterInfo.Rectangle.Width,
                                                (float)characterInfo.Rectangle.Height);
                                        }

                                        // Outline the segment
                                        gr.DrawRectangle(
                                            Pens.Green,
                                            (float)segment.Rectangle.LLX,
                                            (float)segment.Rectangle.LLY,
                                            (float)segment.Rectangle.Width,
                                            (float)segment.Rectangle.Height);
                                    }
                                }
                            }
                        }
                        dataDir = dataDir + "HighlightCharacterInPDF_out.png";
                        bmp.Save(dataDir, System.Drawing.Imaging.ImageFormat.Png);
                    }
                }
                // ExEnd:HighlightCharacterInPDF
                Console.WriteLine("\nCharacters highlighted successfully in pdf document.\nFile saved at " + dataDir);
            }
            catch (Exception ex)
            {
                // The URL previously contained a stray space and capital ("http:// Www."), which made it unusable
                Console.WriteLine(ex.Message + "\nThis example will only work if you apply a valid Aspose License. You can purchase full license or get 30 day temporary license from http://www.aspose.com/purchase/default.aspx.");
            }
        }