XMLExtractor C# (CSharp)代码示例

示例#1

0

显示文件

文件： Program.cs 项目： babylon3389/ByteScout-SDK-SourceCode

        /// <summary>
        /// Detects tables on every page of "sample3.pdf" and exports each detected table
        /// to its own XML file named "page-{page}-table-{n}.xml", then opens the first
        /// file that was written.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.XMLExtractor instance
            XMLExtractor xmlExtractor = new XMLExtractor();
            // Created inside the try block so that xmlExtractor is released even if this constructor throws
            TableDetector tableDetector = null;

            // Name of the first XML file actually written; stays null when no table is found
            string firstOutputFile = null;

            try
            {
                xmlExtractor.RegistrationName = "demo";
                xmlExtractor.RegistrationKey  = "demo";

                // Create Bytescout.PDFExtractor.TableDetector instance
                tableDetector = new TableDetector();

                tableDetector.RegistrationKey  = "demo";
                tableDetector.RegistrationName = "demo";

                // We should define what kind of tables we should detect.
                // So we set min required number of columns to 3 ...
                tableDetector.DetectionMinNumberOfColumns = 3;
                // ... and we set min required number of rows to 3
                tableDetector.DetectionMinNumberOfRows = 3;

                // Load sample PDF document into both objects
                xmlExtractor.LoadDocumentFromFile(@".\sample3.pdf");
                tableDetector.LoadDocumentFromFile(@".\sample3.pdf");

                // Get page count
                int pageCount = tableDetector.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Table counter used in the output file name; restarts at 1 on every page
                    int t = 1;

                    // Find first table and continue if found
                    if (tableDetector.FindTable(i))
                    {
                        do
                        {
                            // Set extraction area for XML extractor to rectangle received from the table detector
                            xmlExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                            // Export the table to XML file
                            string outputFile = "page-" + i + "-table-" + t + ".xml";
                            xmlExtractor.SavePageXMLToFile(i, outputFile);

                            if (firstOutputFile == null)
                            {
                                firstOutputFile = outputFile;
                            }

                            t++;
                        }
                        while (tableDetector.FindNextTable()); // search next table
                    }
                }
            }
            finally
            {
                // Release both SDK objects even when loading or extraction fails
                xmlExtractor.Dispose();

                if (tableDetector != null)
                {
                    tableDetector.Dispose();
                }
            }

            // Open first output file in default associated application (for demo purposes).
            // Skipped when no table was detected, because no file exists to open in that case.
            if (firstOutputFile != null)
            {
                System.Diagnostics.Process.Start(firstOutputFile);
            }
        }

示例#2

0

显示文件

        /// <summary>
        /// Extracts "interactiveform.pdf" as XML and traces every filled edit box and
        /// every checked check box found among its "control" nodes.
        /// </summary>
        static void Main()
        {
            // Create XMLExtractor instance
            XMLExtractor extractor = new XMLExtractor();
            string xmlText;

            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("interactiveform.pdf");

                // Get PDF document text as XML
                xmlText = extractor.GetXML();
            }
            finally
            {
                // The XML is now held in a string, so the extractor can be released right away
                extractor.Dispose();
            }

            // Load XML
            XmlDocument xmlDocument = new XmlDocument();
            xmlDocument.LoadXml(xmlText);

            // Select all "control" nodes
            XmlNodeList formControls = xmlDocument.SelectNodes("//control");

            if (formControls == null)
            {
                return;
            }

            foreach (XmlNode formControl in formControls)
            {
                XmlAttribute typeAttribute = formControl.Attributes["type"];

                if (typeAttribute == null)
                {
                    // A control without a "type" attribute cannot be classified; skip it instead of crashing
                    continue;
                }

                // Missing "id" is reported as an empty string rather than throwing
                XmlAttribute idAttribute = formControl.Attributes["id"];
                string id = idAttribute != null ? idAttribute.Value : string.Empty;

                // Trace filled textboxes
                if (typeAttribute.Value == "editbox")
                {
                    if (!String.IsNullOrEmpty(formControl.InnerText))
                    {
                        Trace.WriteLine("EDITBOX " + id + ": " + formControl.InnerText);
                    }
                }
                // Trace checked checkboxes
                else if (typeAttribute.Value == "checkbox")
                {
                    XmlAttribute stateAttribute = formControl.Attributes["state"];

                    if (stateAttribute != null && stateAttribute.Value == "1")
                    {
                        Trace.WriteLine("CHECKBOX " + id + ": " + stateAttribute.Value);
                    }
                }
            }
        }

示例#3

0

显示文件

文件： Program.cs 项目： remlex/ByteScout-SDK-SourceCode

        /// <summary>
        /// Converts "sample3.pdf" to "output.XML", waits for a key press and then opens
        /// the result with the application associated with the file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.XMLExtractor instance
            XMLExtractor extractor = new XMLExtractor();

            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample3.pdf");

                extractor.SaveXMLToFile("output.XML");
            }
            finally
            {
                // Cleanup: release the extractor before waiting on user input, even if extraction failed
                extractor.Dispose();
            }

            Console.WriteLine();
            Console.WriteLine("Data has been extracted to 'output.XML' file.");
            Console.WriteLine();
            Console.WriteLine("Press any key to continue and open XML in default XML viewer...");
            Console.ReadKey();

            Process.Start("output.XML");
        }

示例#4

0

显示文件

文件： Program.cs 项目： repohoarder/ByteScout-SDK-SourceCode

        /// <summary>
        /// Converts "sample3.pdf" to "output.XML" and waits for a key press before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string inputPdf  = "sample3.pdf";
            const string outputXml = "output.XML";

            // Bytescout.PDFExtractor.XMLExtractor configured with the demo registration
            XMLExtractor pdfToXml = new XMLExtractor
            {
                RegistrationName = "demo",
                RegistrationKey  = "demo"
            };

            // Read the sample PDF and write its content out as XML
            pdfToXml.LoadDocumentFromFile(inputPdf);
            pdfToXml.SaveXMLToFile(outputXml);

            // The extractor is no longer needed once the file is saved
            pdfToXml.Dispose();

            // Report the result and keep the console window open
            Console.WriteLine();
            Console.WriteLine("Data has been extracted to 'output.XML' file.");
            Console.WriteLine();
            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

示例#5

0

显示文件

        /// <summary>
        /// Converts "sample_ocr.pdf" to "output.xml" with OCR enabled in Auto mode and
        /// opens the result in the application associated with the file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Bytescout.PDFExtractor.XMLExtractor configured with the demo registration
            XMLExtractor ocrExtractor = new XMLExtractor
            {
                RegistrationName = "demo",
                RegistrationKey  = "demo"
            };

            // The sample document is loaded before any OCR option is set
            ocrExtractor.LoadDocumentFromFile("sample_ocr.pdf");

            // OCR in .Auto mode (SDK automatically checks if needs to use OCR or not)
            ocrExtractor.OCRMode = OCRMode.Auto;

            // Folder holding the OCR language data files
            ocrExtractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

            // OCR language: "eng" for English, "deu" for German, "fra" for French,
            // "spa" for Spanish etc - according to files in "ocrdata" folder.
            // More language files: https://github.com/bytescout/ocrdata
            ocrExtractor.OCRLanguage = "eng";

            // PDF document rendering resolution
            ocrExtractor.OCRResolution = 300;

            // Optional preprocessing filters that can improve recognition on low-quality
            // scans. All of them are left disabled in this sample:
            //
            //   Automatically deskew skewed scans:
            //     ocrExtractor.OCRImagePreprocessingFilters.AddDeskew();
            //   Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's
            //   page segmentation errors):
            //     ocrExtractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
            //     ocrExtractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();
            //   Repair broken letters:
            //     ocrExtractor.OCRImagePreprocessingFilters.AddDilate();
            //   Remove noise:
            //     ocrExtractor.OCRImagePreprocessingFilters.AddMedian();
            //   Apply gamma correction:
            //     ocrExtractor.OCRImagePreprocessingFilters.AddGammaCorrection();
            //   Add contrast:
            //     ocrExtractor.OCRImagePreprocessingFilters.AddContrast(20);
            //
            // (!) The OCRAnalyser class can be used to find an optimal set of image
            // preprocessing filters for a specific document. See "OCR Analyser" example.

            // Write the extracted content to file, then release the extractor
            ocrExtractor.SaveXMLToFile("output.xml");
            ocrExtractor.Dispose();

            // Open result document in default associated application (for demo purpose)
            Process.Start(new ProcessStartInfo("output.xml") { UseShellExecute = true });
        }

示例#6

0

显示文件

文件： Form1.cs 项目： Transoffice/AppJsonImporter

        /// <summary>
        /// Converts the given PDF file to "output.XML", reports progress in
        /// <c>RoutesBoxList</c> and opens the result in the application associated with the file.
        /// </summary>
        /// <param name="fileToOpen">Path of the PDF document to convert.</param>
        private void PdfExtract(string fileToOpen)
        {
            // Create Bytescout.PDFExtractor.XMLExtractor instance
            XMLExtractor extractor = new XMLExtractor();

            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load the requested PDF document
                extractor.LoadDocumentFromFile(fileToOpen);

                extractor.SaveXMLToFile("output.XML");
            }
            finally
            {
                // This method can run many times during the form's lifetime, so the
                // extractor must be released on every call, including failed ones
                extractor.Dispose();
            }

            RoutesBoxList.Items.Add("Data has been extracted to 'output.XML' file.");
            // The viewer is started immediately; nothing here waits for a key press
            RoutesBoxList.Items.Add("Opening XML in default XML viewer...");

            Process.Start("output.XML");
        }

示例#7

0

显示文件

文件： Program.cs 项目： remlex/ByteScout-SDK-SourceCode

        /// <summary>
        /// Reads "Invoice.pdf" and prints the invoice number, invoice date, total and the
        /// tabular data (as XML) located between the "Quantity" and "TOTAL" keywords.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Results
            string invoiceNo   = string.Empty;
            string invoiceDate = string.Empty;
            string total       = string.Empty;
            string tableData   = string.Empty;

            // Create TextExtractor instance
            TextExtractor textExtractor = new TextExtractor("demo", "demo");
            // Created inside the try block so that textExtractor is released even if this constructor throws
            XMLExtractor xmlExtractor = null;

            try
            {
                // Set exact search (default is SmartSearch that works like in Adobe Reader)
                textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

                // Create XMLExtractor instance
                xmlExtractor = new XMLExtractor("demo", "demo");

                // Load document
                textExtractor.LoadDocumentFromFile("Invoice.pdf");
                xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

                // Read the page count once instead of on every loop iteration
                int pageCount = textExtractor.GetPageCount();

                // Iterate pages
                for (int i = 0; i < pageCount; i++)
                {
                    RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
                    // Full page width; top and height are filled in from the keywords found below
                    RectangleF tableRect     = new RectangleF(0, 0, pageRectangle.Width, 0);

                    // NOTE(review): each SetExtractionArea call below stays in effect for the
                    // following Find calls on textExtractor - confirm that Find is not restricted by it.

                    // Search for "Invoice No."
                    if (textExtractor.Find(i, "Invoice No.", false))
                    {
                        // Get the found text rectangle
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        // Assume the text at right is the invoice number.
                        // Shift the rectangle to the right and stretch it to the page's right edge:
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        // Set the extraction region and extract the text
                        textExtractor.SetExtractionArea(textRect);
                        invoiceNo = textExtractor.GetTextFromPage(i).Trim();
                    }

                    // Search for "Invoice Date" and extract text at right
                    if (textExtractor.Find(i, "Invoice Date", false))
                    {
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        textExtractor.SetExtractionArea(textRect);
                        invoiceDate = textExtractor.GetTextFromPage(i).Trim();
                    }

                    // Search for "Quantity" keyword to detect the top of the tabular data rectangle
                    if (textExtractor.Find(i, "Quantity", false))
                    {
                        // Keep the top table coordinate
                        // (use textExtractor.FoundText.Bounds.Bottom if you want to skip column headers)
                        tableRect.Y = textExtractor.FoundText.Bounds.Top;
                    }

                    // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
                    if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
                    {
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        textExtractor.SetExtractionArea(textRect);
                        total = textExtractor.GetTextFromPage(i).Trim();

                        // Calculate the table height
                        tableRect.Height = textRect.Top - tableRect.Top;
                    }

                    // Extract tabular data using XMLExtractor
                    if (tableRect.Height > 0)
                    {
                        xmlExtractor.SetExtractionArea(tableRect);
                        tableData = xmlExtractor.GetXMLFromPage(i);
                    }
                }
            }
            finally
            {
                // Release both SDK objects even when loading or extraction fails
                textExtractor.Dispose();

                if (xmlExtractor != null)
                {
                    xmlExtractor.Dispose();
                }
            }

            // Display extracted data
            Console.WriteLine("Invoice No.: " + invoiceNo);
            Console.WriteLine("Invoice Date: " + invoiceDate);
            Console.WriteLine("TOTAL: " + total);
            Console.WriteLine("Table Data: ");
            Console.WriteLine(tableData);

            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

示例#8

0

显示文件

        /// <summary>
        /// Extracts "filled_form.pdf" as XML and prints every form control found in it
        /// (edit boxes, check boxes, radio buttons, combo boxes and list boxes) to the console.
        /// </summary>
        static void Main()
        {
            // Create XMLExtractor instance
            XMLExtractor extractor = new XMLExtractor();
            string xmlText;

            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\filled_form.pdf");

                // Get PDF document text as XML
                xmlText = extractor.GetXML();
            }
            finally
            {
                // Cleanup: the XML is now held in a string, so the extractor is released
                // here and is not leaked if loading or extraction throws
                extractor.Dispose();
            }

            // Load XML
            XmlDocument xmlDocument = new XmlDocument();
            xmlDocument.LoadXml(xmlText);

            // Select all "control" nodes
            XmlNodeList formControls = xmlDocument.SelectNodes("//control");

            if (formControls != null)
            {
                foreach (XmlNode controlNode in formControls)
                {
                    XmlAttribute typeAttribute = controlNode.Attributes["type"];

                    if (typeAttribute == null)
                    {
                        // A control without a "type" attribute cannot be classified; skip it instead of crashing
                        continue;
                    }

                    // Missing attributes yield null, which prints as an empty value instead of throwing
                    string id    = controlNode.Attributes["id"]?.Value;
                    string state = controlNode.Attributes["state"]?.Value;

                    switch (typeAttribute.Value)
                    {
                        // Show textboxes
                        case "editbox":
                            Console.WriteLine("EDITBOX:");
                            Console.WriteLine("  id = " + id);
                            Console.WriteLine("  text = " + controlNode.InnerText);
                            break;

                        // Show checkboxes
                        case "checkbox":
                            Console.WriteLine("CHECKBOX:");
                            Console.WriteLine("  id = " + id);
                            Console.WriteLine("  state = " + state);
                            break;

                        // Show radio-buttons (the "id" attribute is printed as the group)
                        case "radiobutton":
                            Console.WriteLine("RADIOBUTTON:");
                            Console.WriteLine("  group = " + id);
                            Console.WriteLine("  state = " + state);
                            Console.WriteLine("  value = " + controlNode.InnerText);
                            break;

                        // Show comboboxes and listboxes: both list their items the same way
                        case "combobox":
                        case "listbox":
                            Console.WriteLine(typeAttribute.Value == "combobox" ? "COMBOBOX:" : "LISTBOX:");
                            Console.WriteLine("  id = " + id);
                            // list items:
                            foreach (XmlNode valueNode in controlNode.SelectNodes("values/value"))
                            {
                                bool isSelected = valueNode.Attributes["selected"]?.Value == "true";
                                string label = isSelected ? "  value (selected) = " : "  value = ";
                                Console.WriteLine(label + valueNode.InnerText);
                            }
                            break;
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

C# (CSharp) XMLExtractor示例