/// <summary>
/// Searches the first page of "words-with-hyphens.pdf" for the string "hyphen"
/// and writes the bounds of every match to the HTTP response as HTML.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // This test file will be copied to the project directory on the pre-build event (see the project properties).
    String inputFile = Server.MapPath("words-with-hyphens.pdf");

    // Create Bytescout.PDFExtractor.TextExtractor instance.
    // The using block releases the extractor even if loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(inputFile);

        Response.Clear();
        Response.ContentType = "text/html";

        Response.Write("Searching for \"hyphen\" string:<br><br>");

        // Search for "hyphen" string on page index 0 (case-insensitive)
        if (extractor.Find(0, "hyphen", false))
        {
            do
            {
                Response.Write("Found at location " + extractor.FoundText.Bounds.ToString() + "<br>");
            }
            while (extractor.FindNext());
        }
    }

    // End the response only after the extractor has been released.
    Response.End();
}
/// <summary>
/// Searches every page of "words-with-hyphens.pdf" for the string "hyphen"
/// and prints the page index and bounds of each match to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.TextExtractor instance.
    // The using block guarantees cleanup even when loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("words-with-hyphens.pdf");

        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            // Search each page for "hyphen" string
            if (extractor.Find(i, "hyphen", false))
            {
                do
                {
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Handles the Find button: searches page index 0 of the loaded document for the
/// text entered in <c>textBoxFind</c> and writes a report of every match, including
/// the position and font details of each element, into <c>textBox1</c>.
/// Does nothing when the search box is empty.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Button_Find(object sender, RoutedEventArgs e)
{
    // Only this page is searched, so the same index is reported for every match.
    // (The original message referenced an undeclared loop variable "i".)
    const int pageIndex = 0;

    string searchText = textBoxFind.Text;
    if (searchText.Length == 0)
    {
        return;
    }

    StringBuilder builder = new StringBuilder();
    builder.AppendLine("Searching for \"" + searchText + "\"");

    if (extractor.Find(pageIndex, searchText, false))
    {
        do
        {
            builder.AppendLine("");
            builder.AppendLine("Found on page " + pageIndex + " at location " + extractor.FoundText.Bounds.ToString());
            builder.AppendLine("");

            // Iterate through each element in the found text
            foreach (SearchResultElement element in extractor.FoundText.Elements)
            {
                builder.AppendLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                builder.AppendLine("Text: " + element.Text);
                builder.AppendLine("Font is bold: " + element.FontIsBold);
                builder.AppendLine("Font is italic:" + element.FontIsItalic);
                builder.AppendLine("Font name: " + element.FontName);
                builder.AppendLine("Font size:" + element.FontSize);
                builder.AppendLine("Font color:" + element.FontColor);
            }
        }
        while (extractor.FindNext());
    }

    builder.AppendLine("Finished.");

    textBox1.Text = builder.ToString();
}
/// <summary>
/// Searches every page of "Invoice.pdf" for dates in the 12/31/1999 format using a
/// regular expression and prints each match with the font details of its elements.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Search dates in format 12/31/1999.
    // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
    const string regexPattern = "[0-9]{2}/[0-9]{2}/[0-9]{4}";

    // Create Bytescout.PDFExtractor.TextExtractor instance.
    // The using block guarantees cleanup even when loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\Invoice.pdf");

        // Enable the regular expressions
        extractor.RegexSearch = true;

        int pageCount = extractor.GetPageCount();

        // Search through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Search each page for the pattern
            if (extractor.Find(i, regexPattern, false))
            {
                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds);
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (ISearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine("   Text: " + element.Text);
                        Console.WriteLine("   Font is bold: " + element.FontIsBold);
                        Console.WriteLine("   Font is italic: " + element.FontIsItalic);
                        Console.WriteLine("   Font name: " + element.FontName);
                        Console.WriteLine("   Font size: " + element.FontSize);
                        Console.WriteLine("   Font color: " + element.FontColor);
                        Console.WriteLine();
                    }
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Renders the first page of "sample.pdf" to an image, paints a semi-transparent
/// yellow rectangle over every piece of text matching the decimal-number pattern,
/// saves the image as "result.png" and opens it.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string inputFile = @"sample.pdf";
    const int pageIndex = 0;
    const float renderingResolution = 300f;
    const string searchPattern = "\\d+\\.\\d+";

    // Divisor used to convert document points to pixels at the rendering resolution.
    const float pointsPerInch = 72f;

    // Prepare TextExtractor
    using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
    {
        textExtractor.RegexSearch = true;
        textExtractor.LoadDocumentFromFile(inputFile);

        // Prepare RasterRenderer
        using (RasterRenderer rasterRenderer = new RasterRenderer("demo", "demo"))
        {
            rasterRenderer.LoadDocumentFromFile(inputFile);

            // Render document page to image. The image and the brush wrap GDI+
            // resources, so both are disposed deterministically.
            using (Image image = rasterRenderer.GetImage(pageIndex, renderingResolution))
            using (Brush highlightBrush = new SolidBrush(Color.FromArgb(128, Color.Yellow)))
            {
                using (Graphics graphics = Graphics.FromImage(image))
                {
                    // Search for pattern and paint found pieces
                    if (textExtractor.Find(pageIndex, searchPattern, caseSensitive: false))
                    {
                        do
                        {
                            foreach (var foundPiece in textExtractor.FoundText.Elements)
                            {
                                // Convert from document Points to pixels
                                Rectangle pixelRect = new Rectangle(
                                    (int)(foundPiece.Bounds.Left / pointsPerInch * renderingResolution),
                                    (int)(foundPiece.Bounds.Top / pointsPerInch * renderingResolution),
                                    (int)(foundPiece.Bounds.Width / pointsPerInch * renderingResolution),
                                    (int)(foundPiece.Bounds.Height / pointsPerInch * renderingResolution)
                                );

                                // Paint rectangle
                                graphics.FillRectangle(highlightBrush, pixelRect);
                            }
                        }
                        while (textExtractor.FindNext());
                    }
                }

                // Save after the Graphics object is disposed so all drawing is flushed.
                image.Save("result.png");
            }

            // Open result image in the default associated application (for demo purposes)
            Process.Start("result.png");
        }
    }
}
/*
 * IF YOU SEE TEMPORARY FOLDER ACCESS ERRORS:
 *
 * Temporary folder access is required for a web application that uses ByteScout SDK.
 * If you get errors related to temporary folder access, such as
 * "Access to the path 'C:\Windows\TEMP\... is denied", you need to grant permission
 * for this temporary folder on that machine and IIS configuration, because ByteScout SDK
 * requires access to the temp folder to cache some of its data for more efficient work.
 *
 * SOLUTION:
 *
 * If your IIS Application Pool has the "Load User Profile" option enabled, IIS provides
 * access to the user's temp folder. Check the user's temporary folder.
 *
 * If you are running the Web Application under an impersonated account or the IIS_IUSRS
 * group, IIS may redirect all requests into a separate temp folder like "c:\temp\".
 *
 * In this case:
 * - check the User or User Group your web application is running under
 * - then add permissions for this User or User Group to read and write into that temp
 *   folder (c:\temp or c:\windows\temp\ folder)
 * - restart your web application and try again
 */

/// <summary>
/// Searches page index 0 of "sample1.pdf" for the whole word "ipsum" and writes every
/// match, with the bounds and font details of each of its elements, to the HTTP
/// response as HTML.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    String inputFile = Server.MapPath(@".\bin\sample1.pdf");

    // Create Bytescout.PDFExtractor.TextExtractor instance.
    // The using block releases the extractor even if loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(inputFile);

        // Set the matching mode.
        // WordMatchingMode.None - treats the search string as substring
        // WordMatchingMode.ExactMatch - treats the search string as separate word
        // WordMatchingMode.SmartMatch - will find the word in various forms (like Adobe Reader).
        extractor.WordMatchingMode = WordMatchingMode.ExactMatch;

        Response.Clear();
        Response.ContentType = "text/html";

        Response.Write("Searching for \"ipsum\" string:<br>");

        // Search for "ipsum" string
        if (extractor.Find(0, "ipsum", false))
        {
            do
            {
                Response.Write("<br/>");
                Response.Write("Found on page 1 at location " + extractor.FoundText.Bounds + "<br/>");
                Response.Write("<br/>");

                // The found text may be split into parts.
                // Iterate through each part of the found text.
                for (var i = 0; i < extractor.FoundText.Elements.Count; i++)
                {
                    ISearchResultElement element = extractor.FoundText.Elements[i];

                    Response.Write("Element #" + i + " at " + element.Bounds + "<br/>");
                    // Text and font name come from the document, so they are HTML-encoded
                    // to keep characters such as '<' or '&' from corrupting the markup.
                    Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
                    Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
                    Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
                    Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
                    Response.Write("Font size:" + element.FontSize + "<br/>");
                    Response.Write("Font color:" + element.FontColor + "<br/>");
                }
            }
            while (extractor.FindNext());
        }
    }

    // End the response only after the extractor has been released.
    Response.End();
}
/// <summary>
/// Searches every page of "sample1.pdf" for the whole word "ipsum" and prints each
/// match with the position and font details of its elements.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.TextExtractor instance.
    // The using block guarantees cleanup even when loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample1.pdf");

        // Set the matching mode.
        // WordMatchingMode.None - treats the search string as substring
        // WordMatchingMode.ExactMatch - treats the search string as separate word
        // WordMatchingMode.SmartMatch - will find the word in various forms (like Adobe Reader).
        extractor.WordMatchingMode = WordMatchingMode.ExactMatch;

        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            // Search each page for "ipsum" string
            if (extractor.Find(i, "ipsum", false))
            {
                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (SearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                        Console.WriteLine("Text: " + element.Text);
                        Console.WriteLine("Font is bold: " + element.FontIsBold);
                        Console.WriteLine("Font is italic:" + element.FontIsItalic);
                        Console.WriteLine("Font name: " + element.FontName);
                        Console.WriteLine("Font size:" + element.FontSize);
                        Console.WriteLine("Font color:" + element.FontColor);
                    }
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Searches every page of "sample1.pdf" with a regular expression for text that starts
/// with LABORIS and ends with VELIT, and prints each match with the position and font
/// details of its elements.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Searches for the text starting from LABORIS and ending with VELIT words.
    // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
    const string regexPattern = "LABORIS.*VELIT";

    // Create Bytescout.PDFExtractor.TextExtractor instance.
    // The original never released the extractor; the using block disposes it on every path.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample1.pdf");

        int pageCount = extractor.GetPageCount();

        // Turn on the regular expression search
        extractor.RegexSearch = true;

        // Search through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Search each page for the pattern
            if (extractor.Find(i, regexPattern, false))
            {
                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (SearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                        Console.WriteLine("Text: " + element.Text);
                        Console.WriteLine("Font is bold: " + element.FontIsBold);
                        Console.WriteLine("Font is italic:" + element.FontIsItalic);
                        Console.WriteLine("Font name: " + element.FontName);
                        Console.WriteLine("Font size:" + element.FontSize);
                        Console.WriteLine("Font color:" + element.FontColor);
                    }
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Scans each page of "SampleInvoice.pdf" for credit card numbers written as four
/// groups of four digits separated by single spaces, printing the text of every
/// matched element. Any exception is reported to the console as an error message.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Credit card number in the form XXXX XXXX XXXX XXXX.
    // Regular expressions reference: https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
    const string cardNumberPattern = @"[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}";

    try
    {
        // Bytescout.PDFExtractor.TextExtractor instance, released at the end of the block
        using (TextExtractor pdfText = new TextExtractor())
        {
            pdfText.RegistrationName = "demo";
            pdfText.RegistrationKey = "demo";

            // Open the sample invoice
            pdfText.LoadDocumentFromFile("SampleInvoice.pdf");

            // Treat search strings as regular expressions
            pdfText.RegexSearch = true;

            int totalPages = pdfText.GetPageCount();
            int page = 0;

            while (page < totalPages)
            {
                // Walk all matches on this page: first Find, then FindNext until exhausted
                for (bool matched = pdfText.Find(page, cardNumberPattern, false);
                     matched;
                     matched = pdfText.FindNext())
                {
                    // A match can consist of several elements; report each one
                    foreach (ISearchResultElement piece in pdfText.FoundText.Elements)
                    {
                        Console.WriteLine("Found Credit Card Number: " + piece.Text);
                    }
                }

                page++;
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error: " + ex.Message);
    }

    Console.WriteLine();
    Console.WriteLine("Press enter key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Finds decimal numbers on the first page of "sample.pdf", draws a half-transparent
/// red rectangle over each matched piece using the PDF SDK canvas, saves the result
/// as "result.pdf" and opens it.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string inputFile = @"sample.pdf";
    const int pageIndex = 0;
    const string searchPattern = "\\d+\\.\\d+";

    // Text extractor used for locating the matches
    using (var finder = new TextExtractor("demo", "demo"))
    {
        finder.RegexSearch = true;
        finder.LoadDocumentFromFile(inputFile);

        // The same file opened with PDF SDK for drawing
        using (var document = new Document(inputFile))
        {
            document.RegistrationName = "demo";
            document.RegistrationKey = "demo";

            Canvas pageCanvas = document.Pages[pageIndex].Canvas;

            // Red brush with opacity 50 so the text underneath stays visible
            var highlight = new SolidBrush(new ColorRGB(255, 0, 0));
            highlight.Opacity = 50;

            // Visit every match on the page: first Find, then FindNext until exhausted
            for (bool matched = finder.Find(pageIndex, searchPattern, caseSensitive: false);
                 matched;
                 matched = finder.FindNext())
            {
                foreach (var piece in finder.FoundText.Elements)
                {
                    // Grow the bounds slightly (1 horizontally, 2 vertically) before drawing
                    RectangleF area = RectangleF.Inflate(piece.Bounds, 1, 2);
                    pageCanvas.DrawRectangle(highlight, area);
                }
            }

            // Write the highlighted copy
            document.Save("result.pdf");

            // Show it in the default associated application (for demo purposes)
            Process.Start("result.pdf");
        }
    }
}
/// <summary>
/// Searches page index 0 of "sample1.pdf" for the string "ipsum" and writes every
/// match, with the position and font details of each of its elements, to the HTTP
/// response as HTML.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // This test file will be copied to the project directory on the pre-build event (see the project properties).
    String inputFile = Server.MapPath("sample1.pdf");

    // Create Bytescout.PDFExtractor.TextExtractor instance.
    // The using block releases the extractor even if loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(inputFile);

        Response.Clear();
        Response.ContentType = "text/html";

        Response.Write("Searching for \"ipsum\" string:<br><br>");

        // Search for "ipsum" string
        if (extractor.Find(0, "ipsum", false))
        {
            do
            {
                Response.Write("<br/>");
                Response.Write("Found on page 1 at location " + extractor.FoundText.Bounds.ToString() + "<br/>");
                Response.Write("<br/>");

                // Iterate through each element in the found text
                foreach (SearchResultElement element in extractor.FoundText.Elements)
                {
                    Response.Write("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height + "<br/>");
                    // Text and font name come from the document, so they are HTML-encoded
                    // to keep characters such as '<' or '&' from corrupting the markup.
                    Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
                    Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
                    Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
                    Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
                    Response.Write("Font size:" + element.FontSize + "<br/>");
                    Response.Write("Font color:" + element.FontColor + "<br/>");
                }
            }
            while (extractor.FindNext());
        }
    }

    // End the response only after the extractor has been released.
    Response.End();
}
/// <summary>
/// Searches every page of "sample1.pdf" for the string "ipsum" and prints each match
/// with the position and font details of its elements.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.TextExtractor instance.
    // The original never released the extractor; the using block disposes it on every path.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample1.pdf");

        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            // Search each page for "ipsum" string
            if (extractor.Find(i, "ipsum", false))
            {
                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (SearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                        Console.WriteLine("Text: " + element.Text);
                        Console.WriteLine("Font is bold: " + element.FontIsBold);
                        Console.WriteLine("Font is italic:" + element.FontIsItalic);
                        Console.WriteLine("Font name: " + element.FontName);
                        Console.WriteLine("Font size:" + element.FontSize);
                        Console.WriteLine("Font color:" + element.FontColor);
                    }
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Searches every page of "sample2.pdf" for "land" using smart word matching and
/// prints the page index, bounds and matched text of each occurrence.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string searchString = "land";

    // The using block guarantees cleanup even when loading or searching throws.
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Load the document
        extractor.LoadDocumentFromFile("sample2.pdf");

        // Smart match the search string like Adobe Reader
        extractor.WordMatchingMode = WordMatchingMode.SmartMatch;

        // Get page count
        int pageCount = extractor.GetPageCount();

        // Iterate through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Search for text string
            if (extractor.Find(i, searchString, false))
            {
                do
                {
                    // Output search results
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                    // Now we are getting the found text
                    string extractedString = extractor.FoundText.Text;
                    Console.WriteLine("Found text: " + extractedString);
                }
                while (extractor.FindNext()); // Search next occurrence of the search string
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Find button handler. When the search box is not empty, looks for its text on
/// page index 0 and shows a line per match (with the match location) in
/// <c>textBox1</c>, framed by a "Searching for" header and a "Finished." footer.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Button_Find(object sender, RoutedEventArgs e)
{
    string query = textBoxFind.Text;

    // Nothing to search for: leave the output box untouched
    if (query.Length <= 0)
    {
        return;
    }

    var report = new StringBuilder();
    report.Append("Searching for \"").Append(query).AppendLine("\"");

    // First Find, then FindNext until no further occurrence is reported
    for (bool matched = extractor.Find(0, query, false); matched; matched = extractor.FindNext())
    {
        report.Append("Found on page 0 at location ")
              .AppendLine(extractor.FoundText.Location.ToString());
    }

    report.AppendLine("Finished.");

    textBox1.Text = report.ToString();
}
/// <summary>
/// Searches every page of "sample2.pdf" for "what", prints the page index, bounds and
/// matched text of each occurrence, and reports the total number of occurrences.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string searchString = "what";

    int count = 0;

    // The original never released the extractor; the using block disposes it on every path.
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Load the document
        extractor.LoadDocumentFromFile("sample2.pdf");

        // Get page count
        int pageCount = extractor.GetPageCount();

        // Iterate through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Search for text string
            if (extractor.Find(i, searchString, false))
            {
                do
                {
                    count++;

                    // Output search results
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                    // Now we are getting the found text
                    string extractedString = extractor.FoundText.Text;
                    Console.WriteLine("Extracted string: " + extractedString);
                }
                while (extractor.FindNext()); // Search next occurrence of the search string
            }
        }
    }

    // The counter was maintained but never shown; report it so the tally is visible.
    Console.WriteLine("Total occurrences found: " + count);

    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}