/// <summary>
/// Searches every page of "sample1.pdf" with a regular expression and prints
/// the bounds and font details of each match to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Matches text starting with LABORIS and ending with VELIT.
    // See the complete regular expressions reference at
    // https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
    const string regexPattern = "LABORIS.*VELIT";

    // The using block releases the extractor even if loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample1.pdf");

        int pageCount = extractor.GetPageCount();

        // Turn on the regular expression search
        extractor.RegexSearch = true;

        // Search each page for the pattern (the printed page number is the zero-based index)
        for (int i = 0; i < pageCount; i++)
        {
            if (extractor.Find(i, regexPattern, false))
            {
                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (SearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                        Console.WriteLine("Text: " + element.Text);
                        Console.WriteLine("Font is bold: " + element.FontIsBold);
                        Console.WriteLine("Font is italic:" + element.FontIsItalic);
                        Console.WriteLine("Font name: " + element.FontName);
                        Console.WriteLine("Font size:" + element.FontSize);
                        Console.WriteLine("Font color:" + element.FontColor);
                    }
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Extracts the text of "sample_ocr.pdf" with OCR enabled in Auto mode,
/// saves it to "output.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if loading or extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of "tessdata" folder containing language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // You can also apply various preprocessing filters
        // to improve the recognition on low-quality scans.

        // Automatically deskew skewed scans
        //extractor.OCRImagePreprocessingFilters.AddDeskew();

        // Repair broken letters
        //extractor.OCRImagePreprocessingFilters.AddDilate();

        // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentations errors)
        //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
        //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

        // Remove noise
        //extractor.OCRImagePreprocessingFilters.AddMedian();

        // Apply Gamma Correction
        //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open output file in default associated application
    System.Diagnostics.Process.Start("output.txt");
}
/// <summary>
/// Extracts text from "sample_ocr.pdf" twice: first with the profiles
/// "ocr, newspaper-layout" (to "result1.txt"), then with custom profiles loaded
/// from "profiles.json" (to "result2.txt").
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block replaces the trailing Dispose() call so the extractor is
    // released even when one of the load/save calls throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Apply predefined profiles
        extractor.Profiles = "ocr, newspaper-layout";

        // Extract text to file
        extractor.SaveTextToFile("result1.txt");

        extractor.Reset();

        // Load the document again after the reset (the sample reuses the same file)
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Load and apply custom profiles
        extractor.LoadProfiles("profiles.json");
        extractor.Profiles = "keep-formatting, ocr-forced-200dpi";

        // Extract text to file
        extractor.SaveTextToFile("result2.txt");
    }

    // See result files in "bin\Debug" folder
}
/// <summary>
/// Searches the first page of "words-with-hyphens.pdf" for the string "hyphen"
/// using smart word matching and writes each match, with its element details,
/// to the HTTP response as HTML.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    String inputFile = Server.MapPath(@".\bin\words-with-hyphens.pdf");

    // The using block releases the extractor even if loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(inputFile);

        // Set the matching mode
        extractor.WordMatchingMode = WordMatchingMode.SmartMatch;

        Response.Clear();
        Response.ContentType = "text/html";

        Response.Write("Searching for \"hyphen\" string:<br>");

        // Search for "hyphen" string on the first page (index 0)
        if (extractor.Find(0, "hyphen", false))
        {
            do
            {
                Response.Write("<br/>");
                Response.Write("Found on page 1 at location " + extractor.FoundText.Bounds + "<br/>");
                Response.Write("<br/>");

                // The found text may be split into parts.
                // Iterate through each part of the found text.
                for (var i = 0; i < extractor.FoundText.Elements.Count; i++)
                {
                    ISearchResultElement element = extractor.FoundText.Elements[i];

                    Response.Write("Element #" + i + " at " + element.Bounds + "<br/>");
                    // Text taken from the document is HTML-encoded so characters such as
                    // '<' or '&' are shown literally instead of being parsed as markup.
                    Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
                    Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
                    Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
                    Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
                    Response.Write("Font size:" + element.FontSize + "<br/>");
                    Response.Write("Font color:" + element.FontColor + "<br/>");
                }
            }
            while (extractor.FindNext());
        }
    }

    Response.End();
}
/// <summary>
/// Searches every page of "sample1.pdf" for the word "ipsum" using exact word
/// matching and prints the bounds and font details of each match to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample1.pdf");

        // Set the matching mode.
        // WordMatchingMode.None - treats the search string as substring
        // WordMatchingMode.ExactMatch - treats the search string as separate word
        // WordMatchingMode.SmartMatch - will find the word in various forms (like Adobe Reader).
        extractor.WordMatchingMode = WordMatchingMode.ExactMatch;

        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            // Search each page for "ipsum" string
            if (extractor.Find(i, "ipsum", false))
            {
                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (SearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                        Console.WriteLine("Text: " + element.Text);
                        Console.WriteLine("Font is bold: " + element.FontIsBold);
                        Console.WriteLine("Font is italic:" + element.FontIsItalic);
                        Console.WriteLine("Font name: " + element.FontName);
                        Console.WriteLine("Font size:" + element.FontSize);
                        Console.WriteLine("Font color:" + element.FontColor);
                    }
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Worker routine: extracts the text of one page range of a document into an
/// output file, then updates the shared running-thread counter and releases
/// the thread limiter.
/// </summary>
/// <param name="stateInfo">
/// An <c>object[]</c> holding, in order: thread index (int), the "all finished"
/// event (ManualResetEvent), input file path (string), output file path (string),
/// start page (int) and end page (int).
/// </param>
private static void ThreadProc(object stateInfo)
{
    // Unpack the state array once instead of re-casting it for every field.
    object[] state = (object[])stateInfo;
    int threadIndex = (int)state[0];
    ManualResetEvent allFinishedEvent = (ManualResetEvent)state[1];
    string inputFile = (string)state[2];
    string outputFile = (string)state[3];
    int startPage = (int)state[4];
    int endPage = (int)state[5];

    try
    {
        Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage);

        Stopwatch timer = Stopwatch.StartNew();

        // Process the piece
        using (TextExtractor pieceExtractor = new TextExtractor("demo", "demo"))
        {
            // Set page separator. Default is '\f' (Form Feed)
            pieceExtractor.PageSeparator = Environment.NewLine;

            // Since we are only extracting text, disable the caching to reduce memory usage
            pieceExtractor.PageDataCaching = PageDataCaching.None;

            pieceExtractor.OCRMode = OCRMode.Auto;
            pieceExtractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata\";
            pieceExtractor.OCRLanguage = "eng";

            // 300 DPI resolution is recommended.
            // Using of higher values will slow down the processing but does not guarantee the higher quality.
            pieceExtractor.OCRResolution = 300;

            pieceExtractor.LoadDocumentFromFile(inputFile);
            pieceExtractor.SaveTextToFile(startPage, endPage, outputFile);
        }

        Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, timer.Elapsed);
    }
    finally
    {
        // If this was the last running thread, signal the main thread about the finish.
        int stillRunning = Interlocked.Decrement(ref _runningThreadsCounter);
        if (stillRunning == 0)
        {
            allFinishedEvent.Set();
        }

        // Release semaphore
        _threadLimiter.Release();
    }
}
/// <summary>
/// Scans every page of "SampleInvoice.pdf" for numbers in the format
/// "XXXX XXXX XXXX XXXX" and prints each matching element to the console.
/// Any exception is reported by its message only.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Credit card number in format of (XXXX XXXX XXXX XXXX).
    // See the complete regular expressions reference at
    // https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
    const string cardNumberPattern = @"[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}";

    try
    {
        using (TextExtractor pdfText = new TextExtractor())
        {
            pdfText.RegistrationName = "demo";
            pdfText.RegistrationKey = "demo";

            // Load sample PDF document
            pdfText.LoadDocumentFromFile("SampleInvoice.pdf");

            // Enable the regular expressions
            pdfText.RegexSearch = true;

            int totalPages = pdfText.GetPageCount();

            for (int page = 0; page < totalPages; page++)
            {
                // Walk all matches on this page: Find() yields the first, FindNext() the rest.
                bool matched = pdfText.Find(page, cardNumberPattern, false);
                while (matched)
                {
                    foreach (ISearchResultElement part in pdfText.FoundText.Elements)
                    {
                        Console.WriteLine("Found Credit Card Number: " + part.Text);
                    }

                    matched = pdfText.FindNext();
                }
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error: " + ex.Message);
    }

    Console.WriteLine();
    Console.WriteLine("Press enter key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Finds every match of a decimal-number pattern on the first page of
/// "sample.pdf", draws a red rectangle over each found piece, saves the
/// document as "result.pdf" and opens it.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string inputFile = @"sample.pdf";
    const int pageIndex = 0;
    const string searchPattern = "\\d+\\.\\d+";

    // The extractor locates the text; the PDF SDK document is what gets drawn on.
    using (TextExtractor finder = new TextExtractor("demo", "demo"))
    {
        finder.RegexSearch = true;
        finder.LoadDocumentFromFile(inputFile);

        using (Document pdf = new Document(inputFile))
        {
            pdf.RegistrationName = "demo";
            pdf.RegistrationKey = "demo";

            Page targetPage = pdf.Pages[pageIndex];
            Canvas pageCanvas = targetPage.Canvas;

            SolidBrush highlightBrush = new SolidBrush(new ColorRGB(255, 0, 0));
            highlightBrush.Opacity = 50; // make the brush transparent

            // Walk all matches: Find() yields the first, FindNext() the rest.
            bool hasMatch = finder.Find(pageIndex, searchPattern, caseSensitive: false);
            while (hasMatch)
            {
                foreach (var piece in finder.FoundText.Elements)
                {
                    // Inflate the rectangle a bit, then draw it over the PDF page
                    RectangleF highlightArea = RectangleF.Inflate(piece.Bounds, 1, 2);
                    pageCanvas.DrawRectangle(highlightBrush, highlightArea);
                }

                hasMatch = finder.FindNext();
            }

            // Save as new PDF document
            pdf.Save("result.pdf");

            // Open result document in default associated application (for demo purposes)
            Process.Start("result.pdf");
        }
    }
}
/// <summary>
/// Finds SSN-formatted numbers (e.g. 202-55-0130) on the first page of
/// "samplePDF_SSNNo.pdf", removes them with masking enabled, writes the result
/// to "result1.pdf" and opens it.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.Remover2 instance
    Remover2 remover = new Remover2("demo", "demo");
    try
    {
        // Mask removed text, which ultimately blacks out the region
        remover.MaskRemovedText = true;

        // Load sample PDF document
        remover.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

        // Prepare TextExtractor
        using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
        {
            // Load document into TextExtractor
            textExtractor.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

            // Search SSN in format 202-55-0130
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            string regexPattern = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

            // Enable RegexSearch
            textExtractor.RegexSearch = true;

            // Set word matching options
            textExtractor.WordMatchingMode = WordMatchingMode.None;

            // Search results for the first page (index 0)
            ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

            // Remove the text objects found by the search.
            remover.AddTextToRemove(searchResults);

            // Perform removal of specified objects
            remover.PerformRemoval(@"result1.pdf");
        }
    }
    finally
    {
        // Clean up in a finally block so the remover is released even when a call above throws.
        remover.Dispose();
    }

    // Open output file in default application
    ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts the text of "sample2.pdf" into "output.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if loading or saving throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample2.pdf");

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open output file in default associated application
    System.Diagnostics.Process.Start("output.txt");
}
/// <summary>
/// Searches the first page of "sample1.pdf" for the string "ipsum" and writes
/// each match, with its element details, to the HTTP response as HTML.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // This test file will be copied to the project directory on the pre-build event (see the project properties).
    String inputFile = Server.MapPath("sample1.pdf");

    // The using block releases the extractor even if loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(inputFile);

        Response.Clear();
        Response.ContentType = "text/html";

        Response.Write("Searching for \"ipsum\" string:<br><br>");

        // Search for "ipsum" string on the first page (index 0)
        if (extractor.Find(0, "ipsum", false))
        {
            do
            {
                Response.Write("<br/>");
                Response.Write("Found on page 1 at location " + extractor.FoundText.Bounds.ToString() + "<br/>");
                Response.Write("<br/>");

                // Iterate through each element in the found text
                foreach (SearchResultElement element in extractor.FoundText.Elements)
                {
                    Response.Write("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height + "<br/>");
                    // Text taken from the document is HTML-encoded so characters such as
                    // '<' or '&' are shown literally instead of being parsed as markup.
                    Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
                    Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
                    Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
                    Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
                    Response.Write("Font size:" + element.FontSize + "<br/>");
                    Response.Write("Font color:" + element.FontColor + "<br/>");
                }
            }
            while (extractor.FindNext());
        }
    }

    Response.End();
}
/// <summary>
/// For each sample invoice, extracts the text of every region described in
/// "settings.json", compares it with the expected value and prints the
/// per-region result followed by an overall "Fraud Detected" flag.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string[] invoiceFiles =
    {
        "Sample_Files\\InvoiceMar.pdf",
        "Sample_Files\\InvoiceApr.pdf",
        "Sample_Files\\InvoiceApr_Forged.pdf"
    };

    // Deserialize the JSON settings into ExtractionSettings objects
    string settingsText = File.ReadAllText("settings.json");
    JavaScriptSerializer serializer = new JavaScriptSerializer();
    List<ExtractionSettings> regionSettings = serializer.Deserialize<List<ExtractionSettings>>(settingsText);

    foreach (string invoiceFile in invoiceFiles)
    {
        using (TextExtractor invoiceExtractor = new TextExtractor("demo", "demo"))
        {
            // Load document from file
            invoiceExtractor.LoadDocumentFromFile(invoiceFile);

            Console.WriteLine("Evaluating File: {0}\n", invoiceFile);

            // Stays true only while every region matches its expected value.
            bool everyRegionMatches = true;

            foreach (ExtractionSettings setting in regionSettings)
            {
                // Region to extract from
                RectangleF region = new RectangleF(
                    setting.RegionLocation.X,
                    setting.RegionLocation.Y,
                    setting.RegionLocation.Width,
                    setting.RegionLocation.Height);

                var actualValue = GetTextFromRegion(invoiceExtractor, region);
                bool regionMatches = setting.CorrectValue == actualValue;

                Console.WriteLine("Region Type: {0}", setting.RegionType);
                Console.WriteLine("Expected Value: {0}", setting.CorrectValue);
                Console.WriteLine("Extracted Value: {0}", actualValue);
                Console.WriteLine("Criteria Passed: {0}\n", regionMatches);

                everyRegionMatches &= regionMatches;
            }

            Console.WriteLine("Fraud Detected: {0}", !everyRegionMatches);
            Console.WriteLine("\n\n-----------------------------\n\n");
        }
    }

    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
/// <summary>
/// Lists every "*.pdf" file in the directory four levels up and prints its
/// document information plus the first two lines of text from its first page.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.InfoExtractor instance
    InfoExtractor infoExtractor = new InfoExtractor();
    try
    {
        infoExtractor.RegistrationName = "demo";
        infoExtractor.RegistrationKey = "demo";

        // Both extractors are now released even if a file fails to load part-way through the listing.
        using (TextExtractor textExtractor = new TextExtractor())
        {
            textExtractor.RegistrationName = "demo";
            textExtractor.RegistrationKey = "demo";

            // List all PDF files in directory
            foreach (string file in Directory.GetFiles(@"..\..\..\..", "*.pdf"))
            {
                infoExtractor.LoadDocumentFromFile(file);

                Console.WriteLine("File Name: " + Path.GetFileName(file));
                Console.WriteLine("Page Count: " + infoExtractor.GetPageCount());
                Console.WriteLine("Author: " + infoExtractor.Author);
                Console.WriteLine("Title: " + infoExtractor.Title);
                Console.WriteLine("Producer: " + infoExtractor.Producer);
                Console.WriteLine("Subject: " + infoExtractor.Subject);
                Console.WriteLine("CreationDate: " + infoExtractor.CreationDate);
                Console.WriteLine("Text (first 2 lines): ");

                // Load a couple of lines from each document
                textExtractor.LoadDocumentFromFile(file);
                using (StringReader stringReader = new StringReader(textExtractor.GetTextFromPage(0)))
                {
                    // ReadLine() returns null past the end of the text; WriteLine then prints an empty line.
                    Console.WriteLine(stringReader.ReadLine());
                    Console.WriteLine(stringReader.ReadLine());
                }

                Console.WriteLine();
            }
        }
    }
    finally
    {
        // Cleanup
        infoExtractor.Dispose();
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Initializes the window, then extracts the text of a fixed PDF document into
/// "output.txt" and opens that file.
/// </summary>
public MainWindow()
{
    InitializeComponent();

    // The using block releases the extractor even if loading or saving throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        // NOTE(review): this is an absolute path inside one user's profile - confirm it is intended.
        extractor.LoadDocumentFromFile(@"C:\Users\toky\Documents\Autogids_Autogids_20180131_008.pdf");

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open output file in default associated application
    System.Diagnostics.Process.Start("output.txt");
}
/// <summary>
/// Extracts all text from "sample.png" with OCR enabled in Auto mode and
/// prints it to the console. Any exception is reported by its message only.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        // Read all file content...
        using (TextExtractor extractor = new TextExtractor())
        {
            // Load document
            extractor.LoadDocumentFromFile("sample.png");

            // Extractor Progress event
            Console.WriteLine("Text Extraction in progress: \n");
            extractor.ProgressChanged += Extractor_ProgressChanged;

            // Enable Optical Character Recognition (OCR)
            // in .Auto mode (SDK automatically checks if needs to use OCR or not).
            // An earlier assignment of OCRMode.TextFromImagesAndVectorsAndRepairedFonts
            // was removed: it was overwritten by this line before any extraction ran.
            extractor.OCRMode = OCRMode.Auto;

            // Set the location of OCR language data files
            extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

            // Set OCR language:
            // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
            // Find more language files at https://github.com/bytescout/ocrdata
            extractor.OCRLanguage = "eng";

            // Set PDF document rendering resolution
            extractor.OCRResolution = 300;

            // Read all text
            var allExtractedText = extractor.GetText();

            Console.WriteLine("\n\nExtracted Text:\n\n{0}", allExtractedText);
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }

    Console.WriteLine("Press enter key to exit...");
    Console.ReadLine();
}
/// <summary>
/// Reports whether OCR is recommended for the first page of a document,
/// configures OCR when it is, and prints all text extracted from the document.
/// </summary>
/// <param name="filePath">Path of the document to load and inspect.</param>
private static void _CheckOCRRequired(string filePath)
{
    // Only the first page is consulted for the OCR recommendation.
    const int firstPage = 0;

    using (TextExtractor reader = new TextExtractor())
    {
        reader.RegistrationKey = "demo";
        reader.RegistrationName = "demo";

        // Load document
        reader.LoadDocumentFromFile(filePath);

        Console.WriteLine("\n*******************\n\nFilePath: {0}", filePath);

        bool ocrRecommended = reader.IsOCRRecommendedForPage(firstPage);
        if (!ocrRecommended)
        {
            Console.WriteLine("\nOCR Recommended: False");
        }
        else
        {
            Console.WriteLine("\nOCR Recommended: True");

            // Enable Optical Character Recognition (OCR)
            // in .Auto mode (SDK automatically checks if needs to use OCR or not)
            reader.OCRMode = OCRMode.Auto;

            // Set the location of language data files
            reader.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

            // Set OCR language:
            // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
            // Find more language files at https://github.com/bytescout/ocrdata
            reader.OCRLanguage = "eng";

            // Set PDF document rendering resolution
            reader.OCRResolution = 300;
        }

        // Read all text
        var documentText = reader.GetText();

        Console.WriteLine("\nExtracted Text:\n{0}\n\n", documentText);
    }
}
/// <summary>
/// Splits the first page of "columns.pdf" into three equal-width columns and
/// writes the text of each column, left to right, to the HTTP response.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // This test file will be copied to the project directory on the pre-build event (see the project properties).
    String inputFile = Server.MapPath("columns.pdf");

    // The using block releases the extractor even if loading or extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(inputFile);

        // Read the size of the very first page (zero index)
        float pageWidth = extractor.GetPageRect_Width(0);
        float pageHeight = extractor.GetPageRect_Height(0);

        // The content is assumed to be laid out in 3 columns equally distributed
        // on the page, so one column is a third of the page width.
        float columnWidth = pageWidth / 3f;

        Response.Clear();
        Response.ContentType = "text/html";

        // Iterate through 3 columns
        for (int i = 0; i < 3; i++)
        {
            // Set the extraction area to the #i column
            extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

            // Save extracted text to output stream
            extractor.SavePageTextToStream(0, Response.OutputStream);
        }
    }

    Response.End();
}
/// <summary>
/// Extracts the text of three fixed rectangular regions of "SampleFoldable.pdf",
/// writes the results (one per line group) to "result.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Set extraction regions
    // Use Bytescout Template Editor / Bytescout PDF Multitool or other tool to know region co-ordinates
    var lstExtractionRegion = new List<RectangleF>();
    lstExtractionRegion.Add(new RectangleF(7.5f, 33.8f, 244.5f, 353.3f));
    lstExtractionRegion.Add(new RectangleF(273.8f, 201.8f, 247.5f, 198.0f));
    lstExtractionRegion.Add(new RectangleF(537.8f, 27.0f, 246.0f, 268.5f));

    // Output file
    var resFile = "result.txt";
    var sRes = new StringBuilder();

    // The using block replaces the trailing Dispose() call so the extractor is
    // released even when loading or region extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\SampleFoldable.pdf");

        // Loop through all extraction regions, and extract text
        foreach (var oRegion in lstExtractionRegion)
        {
            var extractedText = GetTextFromRegion(extractor, oRegion);
            sRes.AppendLine(extractedText);
        }
    }

    // Write all results to the output file
    File.WriteAllText(resFile, sRes.ToString());

    // Open result file in default associated application
    ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Searches every page of "sample1.pdf" for the string "ipsum" and prints the
/// bounds and font details of each match to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if loading or searching throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample1.pdf");

        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            // Search each page for "ipsum" string
            if (extractor.Find(i, "ipsum", false))
            {
                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (SearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                        Console.WriteLine("Text: " + element.Text);
                        Console.WriteLine("Font is bold: " + element.FontIsBold);
                        Console.WriteLine("Font is italic:" + element.FontIsItalic);
                        Console.WriteLine("Font name: " + element.FontName);
                        Console.WriteLine("Font size:" + element.FontSize);
                        Console.WriteLine("Font color:" + element.FontColor);
                    }
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Runs OCR on "InvoiceWithNoise.png" with the "ocr-dateIssue" profile from
/// "profiles.json" applied, saves the text to "output.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block replaces the trailing Dispose() call so the extractor is
    // released even when loading, profile loading or extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample scanned document
        extractor.LoadDocumentFromFile("InvoiceWithNoise.png");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // Add profiles to fix issues with dates.
        // To deal with a wrong V in dates you can use a regular expression that
        // replaces only V characters located between numbers.
        extractor.LoadProfiles("profiles.json");
        extractor.Profiles = "ocr-dateIssue";

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts the text of "columns.pdf" column by column into "result.txt" and
/// opens that file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if loading or saving throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\columns.pdf");

        // Extract text by columns (useful if PDF document is designed in column layout like a newspaper)
        extractor.ExtractColumnByColumn = true;

        // Save extracted text to file
        extractor.SaveTextToFile(@".\result.txt");
    }

    // Open result file in default associated application
    System.Diagnostics.Process.Start(@".\result.txt");
}
/// <summary>
/// Finds SSN-formatted numbers (e.g. 202-55-0130) on the first page of
/// "samplePDF_SSNNo.pdf", removes them, writes the result to "result1.pdf"
/// and opens it.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.Remover instance
    Remover remover = new Remover("demo", "demo");
    try
    {
        // Load sample PDF document
        remover.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

        // Prepare TextExtractor
        using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
        {
            // Load document into TextExtractor
            textExtractor.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

            // Search SSN in format 202-55-0130
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            string regexPattern = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

            // Enable RegexSearch
            textExtractor.RegexSearch = true;

            // Set word matching options
            textExtractor.WordMatchingMode = WordMatchingMode.None;

            // Collect matches on the first page (index 0)
            ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

            // Remove the text objects found by the search.
            // NOTE: The removed text might be larger than the specified rectangle. Currently the Remover is unable
            // to split PDF text objects.
            remover.RemoveText(searchResults, @"result1.pdf");
        }
    }
    finally
    {
        // Clean up in a finally block so the remover is released even when a call above throws.
        remover.Dispose();
    }

    // Open output file in default application
    ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Searches every page of "sample2.pdf" for the string "land" using smart word
/// matching and prints the location and text of each match to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block replaces the trailing Dispose() call so the extractor is
    // released even when loading or searching throws.
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Load the document
        extractor.LoadDocumentFromFile("sample2.pdf");

        // Smart match the search string like Adobe Reader
        extractor.WordMatchingMode = WordMatchingMode.SmartMatch;

        string searchString = "land";

        // Get page count
        int pageCount = extractor.GetPageCount();

        // Iterate through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Search for text string
            if (extractor.Find(i, searchString, false))
            {
                do
                {
                    // Output search results
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                    // Now we are getting the found text
                    string extractedString = extractor.FoundText.Text;
                    Console.WriteLine("Found text: " + extractedString);
                }
                while (extractor.FindNext()); // Search next occurrence of the search string
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Searches every page of "sample2.pdf" for the keyword "bombardment" and
/// extracts each page that contains it into its own "pageN.pdf" file, where N
/// is the 1-based page number.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string inputFile = @".\sample2.pdf";

    // The using block replaces the trailing Dispose() call so the extractor is
    // released even when searching or page extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(inputFile);

        int pageCount = extractor.GetPageCount();

        // Search each page for a keyword
        for (int i = 0; i < pageCount; i++)
        {
            if (extractor.Find(i, "bombardment", false))
            {
                // Extract page
                using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
                {
                    splitter.OptimizeSplittedDocuments = true;

                    int pageNumber = i + 1; // (!) page number in ExtractPage() is 1-based
                    string outputFile = @".\page" + pageNumber + ".pdf";

                    splitter.ExtractPage(inputFile, outputFile, pageNumber);

                    Console.WriteLine("Extracted page " + pageNumber + " to file \"" + outputFile + "\"");
                }
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
/// <summary>
/// Runs OCR on "sample_ocr.pdf" with maximized CPU utilization enabled, saves
/// the text to "output.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block replaces the trailing Dispose() call so the extractor is
    // released even when loading or extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // Enables max use of CPU and max use of multiple threads during OCR
        extractor.OCRMaximizeCPUUtilization = true;

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts the text of "sample2.pdf" and writes it to the HTTP response stream.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // This test file will be copied to the project directory on the pre-build event (see the project properties).
    String inputFile = Server.MapPath("sample2.pdf");

    // The using block releases the extractor even if loading or extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(inputFile);

        Response.Clear();
        Response.ContentType = "text/html";

        // Save extracted text to output stream
        extractor.SaveTextToStream(Response.OutputStream);
    }

    Response.End();
}
/// <summary>
/// Extracts the text of "sample2.pdf" into "result.txt" and opens that file in
/// its associated application.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block replaces the trailing Dispose() call so the extractor is
    // released even when loading or saving throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample2.pdf");

        // Save extracted text to file
        extractor.SaveTextToFile(@".\result.txt");
    }

    // Open result file in default associated application.
    // UseShellExecute is set explicitly: opening a document (rather than an
    // executable) requires it, and its default differs between .NET runtimes.
    ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Lets the user pick a PDF file, loads it into the extractor and, on success,
/// remembers the path and shows it as the window title. A load failure is
/// shown in a message box.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Button_Load(object sender, RoutedEventArgs e)
{
    Microsoft.Win32.OpenFileDialog openDialog = new Microsoft.Win32.OpenFileDialog();
    openDialog.DefaultExt = ".pdf";
    openDialog.Filter = "PDF documents (.pdf)|*.pdf";

    // Nothing to do unless the dialog was confirmed.
    if (openDialog.ShowDialog() != true)
    {
        return;
    }

    try
    {
        string chosenPath = openDialog.FileName;
        extractor.LoadDocumentFromFile(chosenPath);

        // Only reached when loading succeeded.
        _pdfFile = chosenPath;
        Title = _pdfFile;
    }
    catch (Exception exception)
    {
        MessageBox.Show(exception.ToString());
    }
}
/// <summary>
/// For every page of "sample2.pdf", extracts the text inside the rectangle
/// (0, 0, 200, 200) and prints it to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Rectangle location to extract from; it is the same for every page.
    RectangleF location = new RectangleF(0, 0, 200, 200);

    // The using block replaces the trailing Dispose() call so the extractor is
    // released even when loading or extraction throws.
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Load document
        extractor.LoadDocumentFromFile(@".\sample2.pdf");

        // Get page count
        int pageCount = extractor.GetPageCount();

        // Iterate through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Set extraction area
            extractor.SetExtractionArea(location);

            // Extract text from the extraction area
            string text = extractor.GetTextFromPage(i);

            Console.WriteLine("Extracted from page #" + i + ":");
            Console.WriteLine();
            Console.WriteLine(text);

            // Reset the extraction area
            extractor.ResetExtractionArea();

            Console.WriteLine();
        }
    }

    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Finds e-mail addresses on the first page of "samplePDF_EmailAddress.pdf",
/// removes them, writes the result to "result1.pdf" and opens it.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.Remover instance
    Remover remover = new Remover("demo", "demo");
    try
    {
        // Load sample PDF document
        remover.LoadDocumentFromFile(@"samplePDF_EmailAddress.pdf");

        // Prepare TextExtractor
        using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
        {
            // Load document into TextExtractor
            textExtractor.LoadDocumentFromFile(@"samplePDF_EmailAddress.pdf");

            // Search email addresses
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            string regexPattern = @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b";

            // Enable RegexSearch
            textExtractor.RegexSearch = true;

            // Set word matching options
            textExtractor.WordMatchingMode = WordMatchingMode.None;

            // Collect matches on the first page (index 0)
            ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

            // Remove the text objects found by the search.
            // NOTE: The removed text might be larger than the specified rectangle. Currently the Remover is unable
            // to split PDF text objects.
            remover.RemoveText(searchResults, @"result1.pdf");
        }
    }
    finally
    {
        // Clean up in a finally block so the remover is released even when a call above throws.
        remover.Dispose();
    }

    // Open output file in default application
    System.Diagnostics.Process.Start("result1.pdf");
}