示例#1
0
        /// <summary>
        /// Handles the page load: extracts text from the first third (by width) of the
        /// first page of "columns.pdf" and writes it to the HTTP response wrapped in a
        /// "pre" element.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            String inputFile = Server.MapPath(@".\bin\columns.pdf");

            // TextExtractor is disposable; the using block releases it on every request,
            // including when loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                // Get dimensions of the first document page and keep one third of its width
                RectangleF rectangle = extractor.GetPageRectangle(0);
                rectangle.Width = rectangle.Width / 3f;

                Response.Clear();
                Response.ContentType = "text/html";

                extractor.SetExtractionArea(rectangle);

                Response.Write("<pre>");

                // Save extracted text to output stream
                extractor.SavePageTextToStream(0, Response.OutputStream);

                Response.Write("</pre>");
            }

            // End the response only after the extractor has been released.
            Response.End();
        }
        /// <summary>
        /// Searches every page of "sample2.pdf" for the keyword "References" and, for each
        /// page that contains it, prints the text extracted from the area (0, 0, 600, 200).
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block guarantees the extractor is released, even on failure.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                int pageCount = extractor.GetPageCount();

                // Search each page for some keyword
                for (int i = 0; i < pageCount; i++)
                {
                    if (extractor.Find(i, "References", false))
                    {
                        // If page contains the keyword, extract a text from it.
                        // For demonstration we'll extract the text from top part of the page only
                        extractor.SetExtractionArea(0, 0, 600, 200);
                        string text = extractor.GetTextFromPage(i);
                        Console.WriteLine(text);
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
示例#3
0
        /// <summary>
        /// Click handler: for every rectangle currently selected in the PDF viewer, extracts
        /// the text of the current page inside that rectangle (OCR in automatic mode) and
        /// shows all results, each preceded by its label from _dataLabels, in a message box.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnGetData_Click(object sender, EventArgs e)
        {
            StringBuilder report = new StringBuilder();
            RectangleF[] regions = pdfViewerControl1.SelectionInPoints;

            using (TextExtractor ocrExtractor = new TextExtractor { RegistrationName = "demo", RegistrationKey = "demo" })
            {
                ocrExtractor.LoadDocumentFromFile(pdfViewerControl1.InputFile);

                // OCR settings are applied after the document has been loaded.
                ocrExtractor.OCRMode = OCRMode.Auto;
                ocrExtractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";
                ocrExtractor.OCRResolution = 300;

                int regionIndex = 0;
                while (regionIndex < regions.Length)
                {
                    ocrExtractor.SetExtractionArea(regions[regionIndex]);

                    // Label line, extracted text line, then an empty separator line.
                    report.AppendLine(_dataLabels[regionIndex])
                          .AppendLine(ocrExtractor.GetText(pdfViewerControl1.CurrentPageIndex, pdfViewerControl1.CurrentPageIndex))
                          .AppendLine();

                    regionIndex++;
                }
            }

            MessageBox.Show(report.ToString());
        }
示例#4
0
        //public string ReadFromPositionSpire()
        //{
        //    PdfPageBase page = Document.Pages[0];
        //    string text = page.ExtractText(new RectangleF(50, 50, 500, 100));
        //    StringBuilder sb = new StringBuilder();
        //    sb.AppendLine(text);
        //    return sb.ToString();
        //    return string.Empty;
        //}

        /// <summary>
        /// Extracts the text located inside the rectangle (0, 0, 200, 200) on the first
        /// page of the PDF document at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the PDF document to load.</param>
        /// <returns>The text the extractor returns for that area of page 0.</returns>
        /// <exception cref="System.ArgumentException">
        /// Thrown when <paramref name="path"/> is null or empty.
        /// </exception>
        public string BytescoutPDFExtractor(string path)
        {
            // The caller's path is used as given; previously it was overwritten by a
            // hard-coded file on a developer machine, so the argument was ignored.
            if (string.IsNullOrEmpty(path))
            {
                throw new System.ArgumentException("A PDF file path is required.", "path");
            }

            // Release the extractor as soon as the text has been read.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // load the document
                extractor.LoadDocumentFromFile(path);

                // define rectangle location to extract from
                RectangleF location = new RectangleF(0, 0, 200, 200);

                // set extraction area
                extractor.SetExtractionArea(location);

                // extract text bounded by the extraction area
                return extractor.GetTextFromPage(0);
            }
        }
示例#5
0
        /// <summary>
        /// Splits the first page of "columns.pdf" into three equally wide columns, saves the
        /// text of each column to "columns-column{i}.txt" and opens every output file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Number of equally distributed columns assumed on the page.
            const int columnCount = 3;

            // The using block guarantees the extractor is released, even on failure.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("columns.pdf");

                // read dimensions of the very first page (zero index)
                float pageWidth  = extractor.GetPageRect_Width(0);
                float pageHeight = extractor.GetPageRect_Height(0);

                // width of one column = page width divided by the number of columns
                float columnWidth = pageWidth / columnCount;

                for (int i = 0; i < columnCount; i++)
                {
                    // set the extraction area to the #i column
                    extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

                    string outFileName = "columns-column" + i + ".txt";
                    extractor.SavePageTextToFile(0, outFileName);

                    // Open output file in default associated application
                    System.Diagnostics.Process.Start(outFileName);
                }
            }
        }
        /// <summary>
        /// Points the extractor at a rectangular region and returns the text of one page
        /// as read through that region.
        /// </summary>
        /// <param name="textExtractor">Extractor with a document already loaded by the caller.</param>
        /// <param name="extractionRegion">Rectangle to set as the extraction area.</param>
        /// <param name="pageIndex">Index of the page to read; defaults to 0.</param>
        /// <returns>The text returned by the extractor for the page.</returns>
        private static string GetTextFromRegion(TextExtractor textExtractor, RectangleF extractionRegion, int pageIndex = 0)
        {
            // The extraction area stays set on the extractor after this call returns.
            textExtractor.SetExtractionArea(extractionRegion);

            string regionText = textExtractor.GetTextFromPage(pageIndex);
            return regionText;
        }
        /// <summary>
        /// Detects bordered tables (at least 3 columns and 3 rows) on every page of
        /// "sample3.pdf", saves the text of each table to "page-{i}-table-{t}.txt" and then
        /// opens the first file that was written, if any.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Name of the first file written; stays null when no table is detected.
            string firstOutputFile = null;

            // Both SDK objects are released by the using blocks even if detection or
            // extraction throws part-way through.
            using (TextExtractor textExtractor = new TextExtractor())
            using (TableDetector tableDetector = new TableDetector())
            {
                textExtractor.RegistrationName = "demo";
                textExtractor.RegistrationKey  = "demo";

                tableDetector.RegistrationKey  = "demo";
                tableDetector.RegistrationName = "demo";

                // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
                tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                // We should define what kind of tables we should detect.
                // So we set min required number of columns to 3 ...
                tableDetector.DetectionMinNumberOfColumns = 3;
                // ... and we set min required number of rows to 3
                tableDetector.DetectionMinNumberOfRows = 3;

                // Load sample PDF document
                textExtractor.LoadDocumentFromFile(@".\sample3.pdf");
                tableDetector.LoadDocumentFromFile(@".\sample3.pdf");

                // Get page count
                int pageCount = tableDetector.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    int t = 1;
                    // Find first table and continue if found
                    if (tableDetector.FindTable(i))
                    {
                        do
                        {
                            // Set extraction area to the rectangle received from the table detector
                            textExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                            // Export the table to TEXT file
                            string outputFile = "page-" + i + "-table-" + t + ".txt";
                            textExtractor.SavePageTextToFile(i, outputFile);

                            if (firstOutputFile == null)
                            {
                                firstOutputFile = outputFile;
                            }

                            t++;
                        }while (tableDetector.FindNextTable()); // search next table
                    }
                }
            }

            // Open first output file in default associated application (for demo purposes).
            // Skipped when nothing was written, so a missing file is never launched.
            if (firstOutputFile != null)
            {
                ProcessStartInfo processStartInfo = new ProcessStartInfo(firstOutputFile);

                processStartInfo.UseShellExecute = true;
                Process.Start(processStartInfo);
            }
        }
示例#8
0
        /// <summary>
        /// Click handler: runs text extraction (OCR in automatic mode, English language data,
        /// resolution 300) on the current viewer page, limited to the first selection
        /// rectangle when one exists, saves the result to "result.txt" and opens that file.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnRunOCR_Click(object sender, EventArgs e)
        {
            // The using block releases the extractor even if loading or OCR throws;
            // previously Dispose was only reached on the success path.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(pdfViewerControl1.InputFile);

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of "tessdata" folder containing language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata
                // Find more language files at https://github.com/tesseract-ocr/tessdata/tree/3.04.00

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;

                // Set the extraction area to the viewer's selection rectangle
                RectangleF[] selection = pdfViewerControl1.SelectionInPoints;
                if (selection.Length > 0)
                {
                    extractor.SetExtractionArea(selection[0]);
                }

                // Show wait cursor
                Cursor = Cursors.WaitCursor;

                try
                {
                    // Perform OCR and save result to file
                    extractor.SavePageTextToFile(pdfViewerControl1.CurrentPageIndex, "result.txt");
                }
                finally
                {
                    // Revert cursor to default
                    Cursor = Cursors.Default;
                }
            }

            // Open output file in default associated application
            System.Diagnostics.Process.Start("result.txt");
        }
示例#9
0
        /// <summary>
        /// Handles the page load: splits the first page of "columns.pdf" into three equally
        /// wide columns and writes the text of each column, in order, to the HTTP response.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            String inputFile = Server.MapPath("columns.pdf");

            // TextExtractor is disposable; the using block releases it on every request,
            // including when loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                // read dimensions of the very first page (zero index)
                float pageWidth  = extractor.GetPageRect_Width(0);
                float pageHeight = extractor.GetPageRect_Height(0);

                // now we are extracting content assuming we have 3 columns
                // equally distributed on pages:
                // the width of one column is the page width divided by the number of columns (3)
                float columnWidth = pageWidth / 3f;

                Response.Clear();
                Response.ContentType = "text/html";

                // iterate through 3 columns
                for (int i = 0; i < 3; i++)
                {
                    // set the extraction area to the #i column
                    extractor.SetExtractionArea(i * columnWidth, 0, columnWidth, pageHeight);

                    // Save extracted text to output stream
                    extractor.SavePageTextToStream(0, Response.OutputStream);
                }
            }

            // End the response only after the extractor has been released.
            Response.End();
        }
示例#10
0
        /// <summary>
        /// Prints, for every page of "sample2.pdf", the text extracted from the rectangle
        /// (0, 0, 200, 200), then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block replaces the trailing Dispose call so the extractor is
            // released even when loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Define rectangle location to extract from
                    RectangleF location = new RectangleF(0, 0, 200, 200);

                    // Set extraction area
                    extractor.SetExtractionArea(location);

                    // Extract text from the extraction area
                    string text = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":");
                    Console.WriteLine();
                    Console.WriteLine(text);

                    // Reset the extraction area
                    extractor.ResetExtractionArea();

                    Console.WriteLine();
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
示例#11
0
        /// <summary>
        /// Prints, for every page of "../../sample2.pdf", the text extracted from the
        /// rectangle (0, 0, 200, 200), then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The extractor was never disposed before; the using block releases it
            // on both the success and the failure path.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // load the document
                extractor.LoadDocumentFromFile("../../sample2.pdf");

                // get page count
                int pageCount = extractor.GetPageCount();

                // iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // define rectangle location to extract from
                    RectangleF location = new RectangleF(0, 0, 200, 200);

                    // set extraction area
                    extractor.SetExtractionArea(location);

                    // extract text bounded by the extraction area
                    string extractedString = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":\r\n" + extractedString);

                    // reset extraction area to full page (by default)
                    extractor.ResetExtractionArea();

                    Console.WriteLine("\r\n");
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
示例#12
0
        /// <summary>
        /// Reads "Invoice.pdf" and prints the invoice number, invoice date, total and the
        /// tabular data (as XML). Each scalar value is taken from the area to the right of
        /// its label; the table area spans from the "Quantity" label down to the "TOTAL" label.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Results
            string invoiceNo   = string.Empty;
            string invoiceDate = string.Empty;
            string total       = string.Empty;
            string tableData   = string.Empty;

            // Both extractors are released by the using blocks, even if parsing throws.
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
            {
                // Set exact search (default is SmartSearch that works like in Adobe Reader)
                textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

                // Load document
                textExtractor.LoadDocumentFromFile("Invoice.pdf");
                xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

                // Read the page count once instead of on every loop iteration
                int pageCount = textExtractor.GetPageCount();

                // Iterate pages
                for (int i = 0; i < pageCount; i++)
                {
                    RectangleF pageRectangle = textExtractor.GetPageRectangle(i);

                    // Full page width; Y and Height are filled in once the labels are found
                    RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

                    // Search for "Invoice No."
                    if (textExtractor.Find(i, "Invoice No.", false))
                    {
                        // Get the found text rectangle
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        // Assume the text at right is the invoice number.
                        // Shift the rectangle to the right; Left already reflects the new X
                        // when the width is computed, so the area ends at the page's right edge.
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        // Set the extraction region and extract the text
                        textExtractor.SetExtractionArea(textRect);
                        invoiceNo = textExtractor.GetTextFromPage(i).Trim();
                    }

                    // Search for "Invoice Date" and extract text at right
                    if (textExtractor.Find(i, "Invoice Date", false))
                    {
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        textExtractor.SetExtractionArea(textRect);
                        invoiceDate = textExtractor.GetTextFromPage(i).Trim();
                    }

                    // Search for "Quantity" keyword to detect the top of the tabular data rectangle
                    if (textExtractor.Find(i, "Quantity", false))
                    {
                        // Keep the top table coordinate
                        // (use FoundText.Bounds.Bottom instead to skip the column headers)
                        tableRect.Y = textExtractor.FoundText.Bounds.Top;
                    }

                    // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
                    if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
                    {
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        textExtractor.SetExtractionArea(textRect);
                        total = textExtractor.GetTextFromPage(i).Trim();

                        // Calculate the table height
                        tableRect.Height = textRect.Top - tableRect.Top;
                    }

                    // Extract tabular data using XMLExtractor; a non-positive height means
                    // the "TOTAL" label was not found below the table top on this page
                    if (tableRect.Height > 0)
                    {
                        xmlExtractor.SetExtractionArea(tableRect);
                        tableData = xmlExtractor.GetXMLFromPage(i);
                    }
                }
            }

            // Display extracted data
            Console.WriteLine("Invoice No.: " + invoiceNo);
            Console.WriteLine("Invoice Date: " + invoiceDate);
            Console.WriteLine("TOTAL: " + total);
            Console.WriteLine("Table Data: ");
            Console.WriteLine(tableData);

            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }
        /// <summary>
        /// Analyzes page 0 of "sample_ocr.pdf" with OCRAnalyzer, applies the analysis results
        /// to a TextExtractor, saves the extracted text to "result.txt" and opens that file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Input and output files
            string sourcePdf  = @".\sample_ocr.pdf";
            string resultFile = @".\result.txt";

            // Page analyzed by the OCR analyzer
            int analyzedPage = 0;

            // Optional analysis/extraction area; RectangleF.Empty means the full page
            // (for example, new RectangleF(100, 50, 350, 250) restricts it).
            RectangleF area = RectangleF.Empty;

            // Language data location and language code ("eng", "deu", "fra", "spa", ...
            // according to the files in the "ocrdata" folder).
            // Find more language files at https://github.com/bytescout/ocrdata
            string languageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";
            string languageCode = "eng";

            using (OCRAnalyzer analyzer = new OCRAnalyzer("demo", "demo"))
            {
                // Print each progress message reported during the analysis
                analyzer.ProgressChanged += (object source, string text, double percent, ref bool abort) => Console.WriteLine(text);

                analyzer.LoadDocumentFromFile(sourcePdf);

                analyzer.OCRLanguage           = languageCode;
                analyzer.OCRLanguageDataFolder = languageDataFolder;

                analyzer.SetExtractionArea(area);

                // Run the analysis; its results are handed to the text extractor below
                OCRAnalysisResults detected = analyzer.AnalyzeByOCRConfidence(analyzedPage);

                using (TextExtractor reader = new TextExtractor("demo", "demo"))
                {
                    reader.LoadDocumentFromFile(sourcePdf);

                    reader.OCRMode = OCRMode.Auto;
                    reader.OCRLanguageDataFolder = languageDataFolder;
                    reader.OCRLanguage           = languageCode;

                    // Transfer the detected OCR parameters onto the extractor
                    analyzer.ApplyResults(detected, reader);

                    reader.SetExtractionArea(area);

                    reader.SaveTextToFile(resultFile);

                    // Open the result in the default associated application (demo only)
                    ProcessStartInfo launch = new ProcessStartInfo(resultFile);
                    launch.UseShellExecute = true;
                    Process.Start(launch);
                }
            }
        }