/// <summary>
/// Searches the first page of "SearchTextRegex.pdf" with a .NET regular
/// expression and writes the text of every matched fragment to the console.
/// </summary>
public static void Run()
{
    //ExStart: SearchTextWithDotNetRegex
    // Build the full path of the sample document
    string inputPath = RunExamples.GetDataDir_AsposePdf_Text() + "SearchTextRegex.pdf";

    // Pattern matching any run of non-whitespace characters
    System.Text.RegularExpressions.Regex wordPattern =
        new System.Text.RegularExpressions.Regex(@"[\S]+");

    // Load the document and take the page at index 1
    Aspose.Pdf.Document pdf = new Aspose.Pdf.Document(inputPath);
    Page targetPage = pdf.Pages[1];

    // Absorber driven by the regex, with regular-expression search switched on
    TextFragmentAbsorber absorber = new TextFragmentAbsorber(wordPattern);
    absorber.TextSearchOptions.IsRegularExpressionUsed = true;

    // Run the search against the selected page
    targetPage.Accept(absorber);

    // Print the text of each fragment the absorber collected
    foreach (TextFragment match in absorber.TextFragments)
    {
        Console.WriteLine(match.Text);
    }
    //ExEnd: SearchTextWithDotNetRegex
}
/// <summary>
/// Rebuilds the Lucene index in <c>_indexerFolder</c>, adding one Lucene
/// document per PDF page for every file under <c>_textFilesFolder</c> that
/// matches <c>_fileSearchPattern</c> (sub-directories included).
/// Progress is reported through <c>RaiseProgressChanged</c> as a percentage
/// of the page total returned by <c>GetTotalPages</c>.
/// </summary>
/// <param name="analayer">Analyzer passed to the <c>IndexWriter</c>.</param>
public void CreateIndex(Analyzer analayer)
{
    FSDirectory fsDir = new SimpleFSDirectory(new DirectoryInfo(_indexerFolder));

    // 'using' guarantees the writer is disposed even when a PDF fails to
    // load or parse part-way through the run.
    using (IndexWriter indexWriter = new IndexWriter(fsDir, analayer, true, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED))
    {
        Stopwatch stopWatch = Stopwatch.StartNew();
        int analyzedCount = 0;

        string[] files = System.IO.Directory.GetFiles(_textFilesFolder, this._fileSearchPattern, SearchOption.AllDirectories);

        // Count the pages of all files that need to be indexed
        int totalPages = GetTotalPages(files);
        // ElapsedMilliseconds is the total duration; Elapsed.Milliseconds is
        // only the 0-999 millisecond component of the TimeSpan.
        WriteLog("Total pages statistics takes {0}ms", stopWatch.ElapsedMilliseconds);
        stopWatch.Restart();

        // Start indexing
        foreach (string pdfFile in files)
        {
            var fileInfo = new FileInfo(pdfFile);
            var fileName = fileInfo.Name;

            WriteLog("Current file is {0}", pdfFile);

            // Dispose each PDF as soon as its pages are indexed so a large
            // batch does not keep every document alive.
            using (Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(pdfFile))
            {
                // Note: PDF page numbers start at 1
                for (int i = 1; i <= pdfDocument.Pages.Count; i++)
                {
                    // A fresh absorber per page keeps the indexed content
                    // limited to this page; a reused absorber could also
                    // carry the text of pages it visited earlier.
                    TextAbsorber textAbsorber = new TextAbsorber();
                    Page page = pdfDocument.Pages[i];
                    page.Accept(textAbsorber);
                    string pageContent = textAbsorber.Text;

                    Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
                    doc.Add(new Field(LuceneConfig.Field_Path, pdfFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.Add(new Field(LuceneConfig.Field_FileName, fileName, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(LuceneConfig.Field_PageNumber, i.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(LuceneConfig.Field_ContentByPage, pageContent, Field.Store.NO, Field.Index.ANALYZED));

                    indexWriter.AddDocument(doc);
                    analyzedCount++;

                    // Avoid an integer divide-by-zero if the page total came back as 0
                    if (totalPages > 0)
                    {
                        RaiseProgressChanged(analyzedCount * 100 / totalPages);
                    }
                }
            }
        }

        indexWriter.Optimize();

        stopWatch.Stop();
        Console.WriteLine("All completed. It takes {0}ms", stopWatch.ElapsedMilliseconds);
    }
}
/// <summary>
/// Finds every occurrence of the text "Estoque" in "input.pdf", places a red
/// strike-out annotation over each occurrence and saves the result as
/// "StrikeOutWords_out.pdf".
/// </summary>
public static void Run()
{
    // ExStart:StrikeOutWords
    // The path to the documents directory.
    string dataDir = RunExamples.GetDataDir_AsposePdf_Annotations();

    // Open document
    Document document = new Document(dataDir + "input.pdf");

    // Create TextFragment Absorber instance to search particular text fragment
    Aspose.Pdf.Text.TextFragmentAbsorber textFragmentAbsorber = new Aspose.Pdf.Text.TextFragmentAbsorber("Estoque");

    // Iterate through pages of PDF document
    for (int i = 1; i <= document.Pages.Count; i++)
    {
        // Search the page at the current index. Reading Pages[1] here would
        // search page 1 on every pass and never look at the other pages.
        Page page = document.Pages[i];
        page.Accept(textFragmentAbsorber);
    }

    // Create a collection of Absorbed text
    Aspose.Pdf.Text.TextFragmentCollection textFragmentCollection = textFragmentAbsorber.TextFragments;

    // Iterate on above collection (indexed from 1)
    for (int j = 1; j <= textFragmentCollection.Count; j++)
    {
        Aspose.Pdf.Text.TextFragment textFragment = textFragmentCollection[j];

        // Get rectangular dimensions of TextFragment object:
        // start at the fragment position and extend by its width and height
        Aspose.Pdf.Rectangle rect = new Aspose.Pdf.Rectangle(
            (float)textFragment.Position.XIndent,
            (float)textFragment.Position.YIndent,
            (float)textFragment.Position.XIndent + (float)textFragment.Rectangle.Width,
            (float)textFragment.Position.YIndent + (float)textFragment.Rectangle.Height);

        // Instantiate StrikeOut Annotation instance
        StrikeOutAnnotation strikeOut = new StrikeOutAnnotation(textFragment.Page, rect);
        // Set opacity for annotation
        strikeOut.Opacity = .80f;
        // Set the border for annotation instance
        strikeOut.Border = new Border(strikeOut);
        // Set the color of annotation
        strikeOut.Color = Aspose.Pdf.Color.Red;
        // Add annotation to the annotations collection of the fragment's page
        textFragment.Page.Annotations.Add(strikeOut);
    }

    dataDir = dataDir + "StrikeOutWords_out.pdf";
    document.Save(dataDir);
    // ExEnd:StrikeOutWords
    Console.WriteLine("\nWords strikeout successfully.\nFile saved at " + dataDir);
}
/// <summary>
/// Extracts the text of every page of the PDF at <c>_path</c> and writes it,
/// page by page, to <paramref name="stream"/> using <c>Encoding.Default</c>.
/// </summary>
/// <param name="stream">
/// Destination stream. It is wrapped in a <c>StreamWriter</c> that is disposed
/// before this method returns.
/// </param>
public void ExportToText(Stream stream)
{
    // Dispose the document once its text has been exported
    using (Document pdfDocument = new Document(_path))
    using (TextWriter tw = new StreamWriter(stream, Encoding.Default))
    {
        // Page indices start at 1
        for (int i = 1; i <= pdfDocument.Pages.Count; i++)
        {
            // Use a fresh absorber for every page so each write contains
            // only the current page; a shared absorber could repeat the
            // text of pages it had already visited.
            TextAbsorber textAbsorber = new TextAbsorber();
            Page page = pdfDocument.Pages[i];
            page.Accept(textAbsorber);
            tw.Write(textAbsorber.Text);
        }
    }
}
// ExStart:ShowLinkAnnotations
/// <summary>
/// For each link annotation on <paramref name="page"/>, prints the target URI
/// (when the link's action is a <c>GoToURIAction</c>) followed by the page text
/// found inside the annotation's rectangle.
/// </summary>
/// <param name="page">Page whose annotations are inspected.</param>
public static void ShowLinkAnnotations(Page page)
{
    foreach (Aspose.Pdf.Annotations.Annotation annot in page.Annotations)
    {
        LinkAnnotation link = annot as LinkAnnotation;
        if (link == null)
        {
            // Not a link annotation - nothing to report
            continue;
        }

        // Print the URL of the Link Annotation. The action is checked first:
        // a link whose action is missing or is not a URI action would
        // otherwise cause a NullReferenceException here.
        GoToURIAction uriAction = link.Action as GoToURIAction;
        if (uriAction != null)
        {
            Console.WriteLine("URI: " + uriAction.URI);
        }

        // Restrict text extraction to the annotation's rectangle
        TextAbsorber absorber = new TextAbsorber();
        absorber.TextSearchOptions.LimitToPageBounds = true;
        absorber.TextSearchOptions.Rectangle = annot.Rect;
        page.Accept(absorber);
        string extractedText = absorber.Text;

        // Print the text associated with hyperlink
        Console.WriteLine(extractedText);
    }
}
/// <summary>
/// Opens "ExtractLinks.pdf", runs an <c>AnnotationSelector</c> built from a
/// <c>LinkAnnotation</c> over the first page, reads the first selected
/// annotation and saves the document as "ExtractLinks_out.pdf".
/// </summary>
public static void Run()
{
    // Directory holding the sample documents
    string dataDir = RunExamples.GetDataDir_AsposePdf_LinksActions();

    // Load the input file
    string sourcePath = dataDir + "ExtractLinks.pdf";
    Document document = new Document(sourcePath);

    // Run the selector over the page at index 1
    Page firstPage = document.Pages[1];
    LinkAnnotation linkTemplate = new LinkAnnotation(firstPage, Aspose.Pdf.Rectangle.Trivial);
    AnnotationSelector selector = new AnnotationSelector(linkTemplate);
    firstPage.Accept(selector);

    // Take the first entry of the selection
    IList selected = selector.Selected;
    Annotation annotation = (Annotation)selected[0];

    // Write the document back out under the output name
    string targetPath = dataDir + "ExtractLinks_out.pdf";
    document.Save(targetPath);
}
/// <summary>
/// Opens "input.pdf" from the "../../../Data/" directory, runs an
/// <c>AnnotationSelector</c> built from a <c>LinkAnnotation</c> over the first
/// page, reads the first selected annotation and saves the document as
/// "output.pdf".
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    // Resolve the data directory relative to the working directory
    string dataDir = Path.GetFullPath("../../../Data/");

    // Load the input file
    string sourcePath = dataDir + "input.pdf";
    Document document = new Document(sourcePath);

    // Run the selector over the page at index 1
    Page firstPage = document.Pages[1];
    LinkAnnotation linkTemplate = new LinkAnnotation(firstPage, Aspose.Pdf.Rectangle.Trivial);
    AnnotationSelector selector = new AnnotationSelector(linkTemplate);
    firstPage.Accept(selector);

    // Take the first entry of the selection
    IList selected = selector.Selected;
    Annotation annotation = (Annotation)selected[0];

    // Write the document back out under the output name
    string targetPath = dataDir + "output.pdf";
    document.Save(targetPath);
}
/// <summary>
/// Opens "ExtractLinks.pdf", runs an <c>AnnotationSelector</c> built from a
/// <c>LinkAnnotation</c> over the first page, reads the first selected
/// annotation, saves the document as "ExtractLinks_out.pdf" and reports the
/// output path on the console.
/// </summary>
public static void Run()
{
    // ExStart:ExtractLinks
    // Directory holding the sample documents
    string dataDir = RunExamples.GetDataDir_AsposePdf_LinksActions();

    // Load the input file
    string sourcePath = dataDir + "ExtractLinks.pdf";
    Document document = new Document(sourcePath);

    // Run the selector over the page at index 1
    Page firstPage = document.Pages[1];
    LinkAnnotation linkTemplate = new LinkAnnotation(firstPage, Aspose.Pdf.Rectangle.Trivial);
    AnnotationSelector selector = new AnnotationSelector(linkTemplate);
    firstPage.Accept(selector);

    // Take the first entry of the selection
    IList<Annotation> selected = selector.Selected;
    Annotation annotation = selected[0];

    // From here on dataDir holds the full output file path
    dataDir += "ExtractLinks_out.pdf";
    // Write the document back out
    document.Save(dataDir);
    // ExEnd:ExtractLinks
    Console.WriteLine("\nLinks extracted successfully.\nFile saved at " + dataDir);
}
/// <summary>
/// Renders the first image produced by <c>PdfConverter</c> for "input.pdf" at
/// 150 DPI, outlines every word (yellow), text segment (green) and character
/// (black) found on page 1, and saves the result as
/// "HighlightCharacterInPDF_out.png".
/// Any exception is caught and reported on the console together with a
/// licensing hint.
/// </summary>
public static void Run()
{
    try
    {
        // ExStart:HighlightCharacterInPDF
        // The path to the documents directory.
        string dataDir = RunExamples.GetDataDir_AsposePdf_Text();

        int resolution = 150;

        Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(dataDir + "input.pdf");

        using (MemoryStream ms = new MemoryStream())
        {
            PdfConverter conv = new PdfConverter(pdfDocument);
            conv.Resolution = new Resolution(resolution, resolution);
            conv.GetNextImage(ms, System.Drawing.Imaging.ImageFormat.Png);

            // The bitmap is disposed once the image has been saved; it stays
            // inside the MemoryStream scope because it was created from it.
            using (Bitmap bmp = (Bitmap)Bitmap.FromStream(ms))
            {
                using (System.Drawing.Graphics gr = System.Drawing.Graphics.FromImage(bmp))
                {
                    // Scale by resolution/72 and flip the Y axis (offset by the
                    // bitmap height) - presumably so PDF coordinates can be
                    // used directly for drawing on the bitmap.
                    float scale = resolution / 72f;
                    gr.Transform = new System.Drawing.Drawing2D.Matrix(scale, 0, 0, -scale, 0, bmp.Height);

                    // Only page 1 is highlighted. The search therefore runs
                    // once instead of being repeated for every page of the
                    // document with the results discarded.
                    if (pdfDocument.Pages.Count > 0)
                    {
                        Page page = pdfDocument.Pages[1];
                        // The page does not change, so its rectangle is read once
                        Aspose.Pdf.Rectangle rect = page.GetPageRect(true);

                        // Create TextAbsorber object to find all words
                        TextFragmentAbsorber textFragmentAbsorber = new TextFragmentAbsorber(@"[\S]+");
                        textFragmentAbsorber.TextSearchOptions.IsRegularExpressionUsed = true;
                        page.Accept(textFragmentAbsorber);

                        // Get the extracted text fragments
                        TextFragmentCollection textFragmentCollection = textFragmentAbsorber.TextFragments;

                        // Loop through the fragments
                        foreach (TextFragment textFragment in textFragmentCollection)
                        {
                            // Outline the whole fragment
                            gr.DrawRectangle(
                                Pens.Yellow,
                                (float)textFragment.Position.XIndent,
                                (float)textFragment.Position.YIndent,
                                (float)textFragment.Rectangle.Width,
                                (float)textFragment.Rectangle.Height);

                            // Segment and character collections are indexed from 1
                            for (int segNum = 1; segNum <= textFragment.Segments.Count; segNum++)
                            {
                                TextSegment segment = textFragment.Segments[segNum];

                                for (int charNum = 1; charNum <= segment.Characters.Count; charNum++)
                                {
                                    CharInfo characterInfo = segment.Characters[charNum];

                                    Console.WriteLine("TextFragment = " + textFragment.Text + "    Page URY = " + rect.URY +
                                                      "   TextFragment URY = " + textFragment.Rectangle.URY);

                                    // Outline the individual character
                                    gr.DrawRectangle(
                                        Pens.Black,
                                        (float)characterInfo.Rectangle.LLX,
                                        (float)characterInfo.Rectangle.LLY,
                                        (float)characterInfo.Rectangle.Width,
                                        (float)characterInfo.Rectangle.Height);
                                }

                                // Outline the segment
                                gr.DrawRectangle(
                                    Pens.Green,
                                    (float)segment.Rectangle.LLX,
                                    (float)segment.Rectangle.LLY,
                                    (float)segment.Rectangle.Width,
                                    (float)segment.Rectangle.Height);
                            }
                        }
                    }
                }

                // From here on dataDir holds the full output file path
                dataDir = dataDir + "HighlightCharacterInPDF_out.png";
                bmp.Save(dataDir, System.Drawing.Imaging.ImageFormat.Png);
            }
        }
        // ExEnd:HighlightCharacterInPDF
        Console.WriteLine("\nCharacters highlighted successfully in pdf document.\nFile saved at " + dataDir);
    }
    catch (Exception ex)
    {
        // The URL previously contained a stray space and capital letter
        Console.WriteLine(ex.Message + "\nThis example will only work if you apply a valid Aspose License. You can purchase full license or get 30 day temporary license from http://www.aspose.com/purchase/default.aspx.");
    }
}