/// <summary>
/// Detects tables on every page of "sample3.pdf" and exports each detected table
/// to its own XML file named "page-{page}-table-{n}.xml", then opens the first
/// exported file in the default associated application.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string inputFile = @".\sample3.pdf";

    // Name of the first file actually written; stays null when no table is detected,
    // so we never try to open a file that was not produced by this run.
    string firstOutputFile = null;

    // "using" guarantees both SDK objects are disposed even if loading or extraction throws.
    using (XMLExtractor xmlExtractor = new XMLExtractor())
    using (TableDetector tableDetector = new TableDetector())
    {
        xmlExtractor.RegistrationName = "demo";
        xmlExtractor.RegistrationKey = "demo";

        tableDetector.RegistrationName = "demo";
        tableDetector.RegistrationKey = "demo";

        // Define what kind of tables should be detected:
        // at least 3 columns ...
        tableDetector.DetectionMinNumberOfColumns = 3;
        // ... and at least 3 rows.
        tableDetector.DetectionMinNumberOfRows = 3;

        // Load the same sample PDF document into both objects
        xmlExtractor.LoadDocumentFromFile(inputFile);
        tableDetector.LoadDocumentFromFile(inputFile);

        int pageCount = tableDetector.GetPageCount();

        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
        {
            // Find the first table on the page; skip the page if there is none
            if (!tableDetector.FindTable(pageIndex))
            {
                continue;
            }

            // Table numbering restarts at 1 on every page
            int tableNumber = 1;

            do
            {
                // Restrict the XML extractor to the rectangle reported by the table detector
                xmlExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                // Export the table to an XML file
                string outputFile = "page-" + pageIndex + "-table-" + tableNumber + ".xml";
                xmlExtractor.SavePageXMLToFile(pageIndex, outputFile);

                if (firstOutputFile == null)
                {
                    firstOutputFile = outputFile;
                }

                tableNumber++;
            }
            while (tableDetector.FindNextTable()); // search for the next table on the same page
        }
    }

    // Open the first output file in the default associated application (for demo purposes).
    // UseShellExecute must be true to open a document rather than an executable.
    if (firstOutputFile != null)
    {
        System.Diagnostics.ProcessStartInfo startInfo = new System.Diagnostics.ProcessStartInfo(firstOutputFile);
        startInfo.UseShellExecute = true;
        System.Diagnostics.Process.Start(startInfo);
    }
}
/// <summary>
/// Extracts "interactiveform.pdf" as XML and writes every filled edit box and every
/// checked checkbox found among the "control" nodes to the trace output.
/// </summary>
static void Main()
{
    string xmlText;

    // Dispose the extractor as soon as the XML string has been produced,
    // even if loading or extraction throws.
    using (XMLExtractor extractor = new XMLExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("interactiveform.pdf");

        // Get PDF document text as XML
        xmlText = extractor.GetXML();
    }

    // Load XML
    XmlDocument xmlDocument = new XmlDocument();
    xmlDocument.LoadXml(xmlText);

    // Select all "control" nodes
    XmlNodeList formControls = xmlDocument.SelectNodes("//control");
    if (formControls == null)
    {
        return;
    }

    foreach (XmlNode formControl in formControls)
    {
        // Null-conditional access: a control without the expected attribute is
        // skipped instead of crashing the whole run.
        string type = formControl.Attributes?["type"]?.Value;
        string id = formControl.Attributes?["id"]?.Value;

        if (type == "editbox")
        {
            // Trace filled textboxes only
            if (!String.IsNullOrEmpty(formControl.InnerText))
            {
                Trace.WriteLine("EDITBOX " + id + ": " + formControl.InnerText);
            }
        }
        else if (type == "checkbox")
        {
            // Trace checked checkboxes only (state "1")
            string state = formControl.Attributes?["state"]?.Value;
            if (state == "1")
            {
                Trace.WriteLine("CHECKBOX " + id + ": " + state);
            }
        }
    }
}
/// <summary>
/// Converts "sample3.pdf" to "output.XML", waits for a key press and then opens the
/// result in the default XML viewer.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // "using" releases the extractor even if loading or saving throws.
    using (XMLExtractor extractor = new XMLExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample3.pdf");

        extractor.SaveXMLToFile("output.XML");
    }

    Console.WriteLine();
    Console.WriteLine("Data has been extracted to 'output.XML' file.");
    Console.WriteLine();
    Console.WriteLine("Press any key to continue and open XML in default XML viewer...");
    Console.ReadKey();

    // UseShellExecute must be true to open a document (not an executable)
    // in its associated application.
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.XML");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Converts "sample3.pdf" to "output.XML" and waits for a key press before exiting.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // "using" performs the cleanup the original did with an explicit Dispose() call,
    // and also runs when loading or saving throws.
    using (XMLExtractor extractor = new XMLExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample3.pdf");

        extractor.SaveXMLToFile("output.XML");
    }

    Console.WriteLine();
    Console.WriteLine("Data has been extracted to 'output.XML' file.");
    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
/// <summary>
/// Converts "sample_ocr.pdf" to "output.xml" with OCR enabled in automatic mode, then
/// opens the result in the default associated application.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // "using" performs the cleanup even if loading, recognition or saving throws.
    using (XMLExtractor extractor = new XMLExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if it needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

        // Set OCR language:
        // "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc. -
        // according to the files in the "ocrdata" folder.
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // You can also apply various preprocessing filters
        // to improve the recognition on low-quality scans.

        // Automatically deskew skewed scans
        //extractor.OCRImagePreprocessingFilters.AddDeskew();

        // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
        //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
        //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

        // Repair broken letters
        //extractor.OCRImagePreprocessingFilters.AddDilate();

        // Remove noise
        //extractor.OCRImagePreprocessingFilters.AddMedian();

        // Apply Gamma Correction
        //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // Add Contrast
        //extractor.OCRImagePreprocessingFilters.AddContrast(20);

        // (!) You can use the new OCRAnalyser class to find an optimal set of image preprocessing
        // filters for your specific document.
        // See "OCR Analyser" example.

        // Save the extracted content as XML
        extractor.SaveXMLToFile("output.xml");
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.xml");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Converts the given PDF file to "output.XML", reports progress in
/// <c>RoutesBoxList</c> and opens the result in the default XML viewer.
/// </summary>
/// <param name="fileToOpen">Path of the PDF document to convert.</param>
private void PdfExtract(string fileToOpen)
{
    // "using" releases the extractor even if loading or saving throws.
    using (XMLExtractor extractor = new XMLExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load the requested PDF document
        extractor.LoadDocumentFromFile(fileToOpen);

        extractor.SaveXMLToFile("output.XML");
    }

    RoutesBoxList.Items.Add("Data has been extracted to 'output.XML' file.");
    // NOTE(review): this message mentions a key press, but nothing in this method
    // waits for one - the file is opened immediately below. Confirm the intended wording.
    RoutesBoxList.Items.Add("Press any key to continue and open XML in default XML viewer)...");

    // UseShellExecute must be true to open a document (not an executable)
    // in its associated application.
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.XML");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts the invoice number, invoice date, total and the tabular data from
/// "Invoice.pdf" by locating keyword labels and reading the text to their right,
/// then prints the results to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Results
    string invoiceNo = string.Empty;
    string invoiceDate = string.Empty;
    string total = string.Empty;
    string tableData = string.Empty;

    // "using" releases both extractors even if loading or extraction throws.
    using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
    using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
    {
        // Set exact search (default is SmartSearch that works like in Adobe Reader)
        textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

        // Load document
        textExtractor.LoadDocumentFromFile("Invoice.pdf");
        xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

        // The page count is read once instead of on every loop iteration
        int pageCount = textExtractor.GetPageCount();

        // Iterate pages
        for (int i = 0; i < pageCount; i++)
        {
            RectangleF pageRectangle = textExtractor.GetPageRectangle(i);

            // Full page width; top and height are filled in from the keywords found below
            RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

            // NOTE(review): ExtractTextRightOf sets an extraction area on textExtractor that is
            // never reset in this method - confirm whether later Find calls are limited by it.

            // Search for "Invoice No." and assume the text at right is the invoice number
            if (textExtractor.Find(i, "Invoice No.", false))
            {
                invoiceNo = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
            }

            // Search for "Invoice Date" and extract the text at right
            if (textExtractor.Find(i, "Invoice Date", false))
            {
                invoiceDate = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
            }

            // Search for "Quantity" keyword to detect the top of the tabular data rectangle
            if (textExtractor.Find(i, "Quantity", false))
            {
                // Keep the top table coordinate
                // (use FoundText.Bounds.Bottom instead if you want to skip column headers)
                tableRect.Y = textExtractor.FoundText.Bounds.Top;
            }

            // Search for "TOTAL" (it is also the bottom of the tabular data rectangle)
            if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
            {
                RectangleF totalLabelBounds = textExtractor.FoundText.Bounds;
                total = ExtractTextRightOf(textExtractor, i, totalLabelBounds, pageRectangle);

                // Calculate the table height: from the table top down to the top of the "TOTAL" label
                tableRect.Height = totalLabelBounds.Top - tableRect.Top;
            }

            // Extract tabular data using XMLExtractor
            if (tableRect.Height > 0)
            {
                xmlExtractor.SetExtractionArea(tableRect);
                tableData = xmlExtractor.GetXMLFromPage(i);
            }
        }
    }

    // Display extracted data
    Console.WriteLine("Invoice No.: " + invoiceNo);
    Console.WriteLine("Invoice Date: " + invoiceDate);
    Console.WriteLine("TOTAL: " + total);
    Console.WriteLine("Table Data: ");
    Console.WriteLine(tableData);

    Console.WriteLine("Press any key...");
    Console.ReadKey();
}

/// <summary>
/// Reads the trimmed text located between the right edge of a found label and the right
/// edge of the page, on the same band (top and height) as the label.
/// </summary>
/// <param name="textExtractor">Extractor with the document loaded; its extraction area is changed.</param>
/// <param name="pageIndex">Zero-based index of the page to read from.</param>
/// <param name="labelBounds">Bounds of the label the value is expected to follow.</param>
/// <param name="pageRectangle">Rectangle of the page, used for its right edge.</param>
/// <returns>The trimmed text extracted from the region right of the label.</returns>
private static string ExtractTextRightOf(TextExtractor textExtractor, int pageIndex, RectangleF labelBounds, RectangleF pageRectangle)
{
    // RectangleF is a value type, so this is an independent copy of the label bounds
    RectangleF valueRect = labelBounds;

    // Shift the rectangle to start where the label ends and stretch it to the page's right edge
    valueRect.X = labelBounds.Right;
    valueRect.Width = pageRectangle.Right - valueRect.Left;

    // Set the extraction region and extract the text
    textExtractor.SetExtractionArea(valueRect);
    return textExtractor.GetTextFromPage(pageIndex).Trim();
}
/// <summary>
/// Extracts "filled_form.pdf" as XML and prints every form control found among the
/// "control" nodes (edit boxes, checkboxes, radio buttons, comboboxes and listboxes)
/// to the console.
/// </summary>
static void Main()
{
    // "using" performs the cleanup even if loading, extraction or XML parsing throws.
    using (XMLExtractor extractor = new XMLExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\filled_form.pdf");

        // Get PDF document text as XML
        string xmlText = extractor.GetXML();

        // Load XML
        XmlDocument xmlDocument = new XmlDocument();
        xmlDocument.LoadXml(xmlText);

        // Select all "control" nodes
        XmlNodeList formControls = xmlDocument.SelectNodes("//control");

        if (formControls != null)
        {
            foreach (XmlNode controlNode in formControls)
            {
                // Null-conditional access: a control without the expected attribute
                // prints an empty value (or is skipped) instead of crashing the run.
                string type = controlNode.Attributes?["type"]?.Value;
                string id = controlNode.Attributes?["id"]?.Value;

                switch (type)
                {
                    // Show textboxes
                    case "editbox":
                        Console.WriteLine("EDITBOX:");
                        Console.WriteLine(" id = " + id);
                        Console.WriteLine(" text = " + controlNode.InnerText);
                        break;

                    // Show checkboxes
                    case "checkbox":
                        Console.WriteLine("CHECKBOX:");
                        Console.WriteLine(" id = " + id);
                        Console.WriteLine(" state = " + controlNode.Attributes?["state"]?.Value);
                        break;

                    // Show radio-buttons (the "id" attribute is printed as the group)
                    case "radiobutton":
                        Console.WriteLine("RADIOBUTTON:");
                        Console.WriteLine(" group = " + id);
                        Console.WriteLine(" state = " + controlNode.Attributes?["state"]?.Value);
                        Console.WriteLine(" value = " + controlNode.InnerText);
                        break;

                    // Show comboboxes
                    case "combobox":
                        Console.WriteLine("COMBOBOX:");
                        Console.WriteLine(" id = " + id);
                        PrintListValues(controlNode);
                        break;

                    // Show listboxes
                    case "listbox":
                        Console.WriteLine("LISTBOX:");
                        Console.WriteLine(" id = " + id);
                        PrintListValues(controlNode);
                        break;
                }
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadKey();
}

/// <summary>
/// Prints every "values/value" child of a combobox or listbox control node, marking the
/// items whose "selected" attribute equals "true".
/// </summary>
/// <param name="controlNode">The "control" node whose list items are printed.</param>
private static void PrintListValues(XmlNode controlNode)
{
    foreach (XmlNode valueNode in controlNode.SelectNodes("values/value"))
    {
        if (valueNode.Attributes["selected"]?.Value == "true")
        {
            Console.WriteLine(" value (selected) = " + valueNode.InnerText);
        }
        else
        {
            Console.WriteLine(" value = " + valueNode.InnerText);
        }
    }
}